// -*- mode:c++ -*-

// Copyright (c) 2010-2011, 2015, 2019, 2024-2025 Arm Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
// not be construed as granting a license to any other intellectual
// property including but not limited to intellectual property relating
// to a hardware implementation of the functionality of the software
// licensed hereunder.  You may use the software subject to the license
// terms below provided that you ensure that this notice is replicated
// unmodified and in its entirety in all distributions of the software,
// modified or unmodified, in source code or in binary form.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met: redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer;
// redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution;
// neither the name of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

output header {{
    // Instantiate Base with the unsigned integer element type selected by
    // size (0, 1, 2, 3 -> uint8_t, uint16_t, uint32_t, uint64_t) and the
    // operands (dest, op1, op2). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUThreeUReg(unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1, op2);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1, op2);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1, op2);
        if (size == 3)
            return new Base<uint64_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Pick one of two concrete instruction classes by size: 2 constructs
    // BaseS, 3 constructs BaseD, and every other value yields Unknown.
    template <class BaseS, class BaseD>
    StaticInstPtr
    decodeNeonSizeSingleDouble(unsigned size,
                               ExtMachInst machInst, RegIndex dest,
                               RegIndex op1, RegIndex op2)
    {
        if (size == 2)
            return new BaseS(machInst, dest, op1, op2);
        if (size == 3)
            return new BaseD(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Instantiate Base with the signed integer element type selected by
    // size (0, 1, 2, 3 -> int8_t, int16_t, int32_t, int64_t) and the
    // operands (dest, op1, op2). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSThreeUReg(unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1, op2);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, op2);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, op2);
        if (size == 3)
            return new Base<int64_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Signedness dispatcher: forwards to decodeNeonUThreeUReg when
    // notSigned is set and to decodeNeonSThreeUReg otherwise.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUSThreeUReg(bool notSigned, unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1, RegIndex op2)
    {
        return notSigned ?
            decodeNeonUThreeUReg<Base>(size, machInst, dest, op1, op2) :
            decodeNeonSThreeUReg<Base>(size, machInst, dest, op1, op2);
    }

    // Instantiate Base with an unsigned element type of at most 32 bits
    // (size 0, 1, 2 -> uint8_t, uint16_t, uint32_t). Size 3 and anything
    // larger yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUThreeUSReg(unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1, RegIndex op2)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1, op2);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1, op2);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Instantiate Base with a signed element type of at most 32 bits
    // (size 0, 1, 2 -> int8_t, int16_t, int32_t). Size 3 and anything
    // larger yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSThreeUSReg(unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1, RegIndex op2)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1, op2);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, op2);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Only two signed element types are accepted here: size 1 selects
    // int16_t and size 2 selects int32_t. Everything else yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSThreeHAndWReg(unsigned size, ExtMachInst machInst,
                             RegIndex dest, RegIndex op1,
                             RegIndex op2)
    {
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, op2);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Immediate-carrying variant taking a single Base template: size 1
    // selects int16_t, size 2 selects int32_t, and imm is passed through
    // to the constructor. Everything else yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSThreeImmHAndWReg(unsigned size, ExtMachInst machInst,
                                RegIndex dest, RegIndex op1,
                                RegIndex op2, uint64_t imm)
    {
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, op2, imm);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, op2, imm);
        return new Unknown(machInst);
    }

    // Signedness dispatcher: forwards to decodeNeonUThreeUSReg when
    // notSigned is set and to decodeNeonSThreeUSReg otherwise.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUSThreeUSReg(bool notSigned, unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1, RegIndex op2)
    {
        return notSigned ?
            decodeNeonUThreeUSReg<Base>(size, machInst, dest, op1, op2) :
            decodeNeonSThreeUSReg<Base>(size, machInst, dest, op1, op2);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonUThreeUSReg choose the unsigned element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeSReg(bool q, unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonUThreeUSReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonUThreeUSReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonSThreeUSReg choose the signed element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSThreeSReg(bool q, unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonSThreeUSReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonSThreeUSReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Asymmetric signed dispatcher: with q set, BaseQ is decoded through
    // decodeNeonSThreeUReg (sizes 0-3); without q, BaseD is decoded through
    // decodeNeonSThreeUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSThreeXReg(bool q, unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonSThreeUSReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonSThreeUReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Asymmetric unsigned dispatcher: with q set, BaseQ is decoded through
    // decodeNeonUThreeUReg (sizes 0-3); without q, BaseD is decoded through
    // decodeNeonUThreeUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeXReg(bool q, unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonUThreeUSReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonUThreeUReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Signedness dispatcher over the D/Q decoders: notSigned forwards to
    // decodeNeonUThreeSReg, otherwise to decodeNeonSThreeSReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUSThreeSReg(bool q, bool notSigned, unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1, RegIndex op2)
    {
        return notSigned ?
            decodeNeonUThreeSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, op2) :
            decodeNeonSThreeSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, op2);
    }

    // Select BaseQ when q is set and BaseD otherwise; either way the
    // unsigned element type (sizes 0-3) comes from decodeNeonUThreeUReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeReg(bool q, unsigned size,
                        ExtMachInst machInst, RegIndex dest,
                        RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonUThreeUReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonUThreeUReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Select BaseQ when q is set and BaseD otherwise; either way the
    // signed element type (sizes 0-3) comes from decodeNeonSThreeUReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSThreeReg(bool q, unsigned size,
                        ExtMachInst machInst, RegIndex dest,
                        RegIndex op1, RegIndex op2)
    {
        if (!q) {
            return decodeNeonSThreeUReg<BaseD>(
                    size, machInst, dest, op1, op2);
        }
        return decodeNeonSThreeUReg<BaseQ>(
                size, machInst, dest, op1, op2);
    }

    // Signedness dispatcher over the D/Q decoders: notSigned forwards to
    // decodeNeonUThreeReg, otherwise to decodeNeonSThreeReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUSThreeReg(bool q, bool notSigned, unsigned size,
                         ExtMachInst machInst, RegIndex dest,
                         RegIndex op1, RegIndex op2)
    {
        return notSigned ?
            decodeNeonUThreeReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, op2) :
            decodeNeonSThreeReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, op2);
    }

    // size is treated as a flag. A zero size selects uint32_t elements for
    // either BaseQ or BaseD (by q). A non-zero size selects uint64_t
    // elements for BaseQ only; with q clear it yields Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeFpReg(bool q, unsigned size, ExtMachInst machInst,
                          RegIndex dest, RegIndex op1, RegIndex op2)
    {
        if (!size) {
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1, op2);
            return new BaseD<uint32_t>(machInst, dest, op1, op2);
        }

        if (q)
            return new BaseQ<uint64_t>(machInst, dest, op1, op2);
        return new Unknown(machInst);
    }

    // Two-way choice on the size flag: false instantiates Base<uint32_t>,
    // true instantiates Base<uint64_t>.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUThreeScFpReg(bool size, ExtMachInst machInst,
                            RegIndex dest, RegIndex op1, RegIndex op2)
    {
        if (!size)
            return new Base<uint32_t>(machInst, dest, op1, op2);
        return new Base<uint64_t>(machInst, dest, op1, op2);
    }

    // Map the two-bit size to an unsigned element type and forward imm:
    // 0b11 -> uint64_t, 0b10 -> uint32_t, 0b00 -> uint16_t. Any other
    // value (including 0b01) yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUThreeImmScFpReg(uint8_t size, ExtMachInst machInst,
                               RegIndex dest, RegIndex op1,
                               RegIndex op2, uint64_t imm)
    {
        if (size == 0b11)
            return new Base<uint64_t>(machInst, dest, op1, op2, imm);
        if (size == 0b10)
            return new Base<uint32_t>(machInst, dest, op1, op2, imm);
        if (size == 0b00)
            return new Base<uint16_t>(machInst, dest, op1, op2, imm);
        return new Unknown(machInst);
    }

    // Size 1 selects uint16_t and size 2 selects uint32_t; within each
    // size, q chooses BaseQ over BaseD. imm is forwarded to the
    // constructor. Any other size yields Unknown regardless of q.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeImmHAndWReg(bool q, unsigned size, ExtMachInst machInst,
                                RegIndex dest, RegIndex op1,
                                RegIndex op2, uint64_t imm)
    {
        switch (size) {
          case 1:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1, op2, imm);
            return new BaseD<uint16_t>(machInst, dest, op1, op2, imm);
          case 2:
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1, op2, imm);
            return new BaseD<uint32_t>(machInst, dest, op1, op2, imm);
          default:
            return new Unknown(machInst);
        }
    }

    // D/Q overload: size 1 selects int16_t and size 2 selects int32_t;
    // within each size, q chooses BaseQ over BaseD. imm is forwarded to
    // the constructor. Any other size yields Unknown regardless of q.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSThreeImmHAndWReg(bool q, unsigned size, ExtMachInst machInst,
                                RegIndex dest, RegIndex op1,
                                RegIndex op2, uint64_t imm)
    {
        switch (size) {
          case 1:
            if (q)
                return new BaseQ<int16_t>(machInst, dest, op1, op2, imm);
            return new BaseD<int16_t>(machInst, dest, op1, op2, imm);
          case 2:
            if (q)
                return new BaseQ<int32_t>(machInst, dest, op1, op2, imm);
            return new BaseD<int32_t>(machInst, dest, op1, op2, imm);
          default:
            return new Unknown(machInst);
        }
    }

    // Map size to an unsigned element type and q to BaseQ/BaseD, passing
    // imm through: 0b00 -> uint16_t, 0b10 -> uint32_t, 0b11 -> uint64_t.
    // The uint64_t case exists for BaseQ only. Every other combination
    // yields Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUThreeImmFpReg(bool q, unsigned size, ExtMachInst machInst,
                             RegIndex dest, RegIndex op1,
                             RegIndex op2, uint64_t imm)
    {
        switch (size) {
          case 0b00:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1, op2, imm);
            return new BaseD<uint16_t>(machInst, dest, op1, op2, imm);
          case 0b10:
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1, op2, imm);
            return new BaseD<uint32_t>(machInst, dest, op1, op2, imm);
          case 0b11:
            // No BaseD counterpart for this size; fall out to Unknown.
            if (q)
                return new BaseQ<uint64_t>(machInst, dest, op1, op2, imm);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Size 0-3 selects uint8_t/uint16_t/uint32_t/uint64_t; within each
    // size, q chooses BaseQ over BaseD. The instruction is constructed
    // from (machInst, dest, op1, imm). Any other size yields Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoShiftReg(bool q, unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1, uint64_t imm)
    {
        switch (size) {
          case 0:
            if (q)
                return new BaseQ<uint8_t>(machInst, dest, op1, imm);
            return new BaseD<uint8_t>(machInst, dest, op1, imm);
          case 1:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1, imm);
            return new BaseD<uint16_t>(machInst, dest, op1, imm);
          case 2:
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1, imm);
            return new BaseD<uint32_t>(machInst, dest, op1, imm);
          case 3:
            if (q)
                return new BaseQ<uint64_t>(machInst, dest, op1, imm);
            return new BaseD<uint64_t>(machInst, dest, op1, imm);
          default:
            return new Unknown(machInst);
        }
    }

    // Size 0-3 selects int8_t/int16_t/int32_t/int64_t; within each size,
    // q chooses BaseQ over BaseD. The instruction is constructed from
    // (machInst, dest, op1, imm). Any other size yields Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoShiftReg(bool q, unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1, uint64_t imm)
    {
        switch (size) {
          case 0:
            if (q)
                return new BaseQ<int8_t>(machInst, dest, op1, imm);
            return new BaseD<int8_t>(machInst, dest, op1, imm);
          case 1:
            if (q)
                return new BaseQ<int16_t>(machInst, dest, op1, imm);
            return new BaseD<int16_t>(machInst, dest, op1, imm);
          case 2:
            if (q)
                return new BaseQ<int32_t>(machInst, dest, op1, imm);
            return new BaseD<int32_t>(machInst, dest, op1, imm);
          case 3:
            if (q)
                return new BaseQ<int64_t>(machInst, dest, op1, imm);
            return new BaseD<int64_t>(machInst, dest, op1, imm);
          default:
            return new Unknown(machInst);
        }
    }


    // Signedness dispatcher: notSigned forwards to decodeNeonUTwoShiftReg,
    // otherwise to decodeNeonSTwoShiftReg, with identical arguments.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUSTwoShiftReg(bool q, bool notSigned, unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1, uint64_t imm)
    {
        return notSigned ?
            decodeNeonUTwoShiftReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, imm) :
            decodeNeonSTwoShiftReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, imm);
    }

    // Instantiate Base from (machInst, dest, op1, imm) with an unsigned
    // element type of at most 32 bits (size 0, 1, 2 -> uint8_t, uint16_t,
    // uint32_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoShiftUSReg(unsigned size,
                             ExtMachInst machInst, RegIndex dest,
                             RegIndex op1, uint64_t imm)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1, imm);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1, imm);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1, imm);
        return new Unknown(machInst);
    }

    // Instantiate Base from (machInst, dest, op1, imm) with the unsigned
    // element type selected by size (0, 1, 2, 3 -> uint8_t, uint16_t,
    // uint32_t, uint64_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoShiftUReg(unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1, uint64_t imm)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1, imm);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1, imm);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1, imm);
        if (size == 3)
            return new Base<uint64_t>(machInst, dest, op1, imm);
        return new Unknown(machInst);
    }

    // Instantiate Base from (machInst, dest, op1, imm) with the signed
    // element type selected by size (0, 1, 2, 3 -> int8_t, int16_t,
    // int32_t, int64_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSTwoShiftUReg(unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1, uint64_t imm)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1, imm);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, imm);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, imm);
        if (size == 3)
            return new Base<int64_t>(machInst, dest, op1, imm);
        return new Unknown(machInst);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonUTwoShiftUSReg choose the unsigned element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoShiftSReg(bool q, unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1, uint64_t imm)
    {
        if (!q) {
            return decodeNeonUTwoShiftUSReg<BaseD>(
                    size, machInst, dest, op1, imm);
        }
        return decodeNeonUTwoShiftUSReg<BaseQ>(
                size, machInst, dest, op1, imm);
    }

    // Instantiate Base from (machInst, dest, op1, imm) with a signed
    // element type of at most 32 bits (size 0, 1, 2 -> int8_t, int16_t,
    // int32_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSTwoShiftUSReg(unsigned size,
                             ExtMachInst machInst, RegIndex dest,
                             RegIndex op1, uint64_t imm)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1, imm);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1, imm);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1, imm);
        return new Unknown(machInst);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonSTwoShiftUSReg choose the signed element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoShiftSReg(bool q, unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1, uint64_t imm)
    {
        if (!q) {
            return decodeNeonSTwoShiftUSReg<BaseD>(
                    size, machInst, dest, op1, imm);
        }
        return decodeNeonSTwoShiftUSReg<BaseQ>(
                size, machInst, dest, op1, imm);
    }

    // Signedness dispatcher: notSigned forwards to
    // decodeNeonUTwoShiftSReg, otherwise to decodeNeonSTwoShiftSReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUSTwoShiftSReg(bool q, bool notSigned, unsigned size,
                             ExtMachInst machInst, RegIndex dest,
                             RegIndex op1, uint64_t imm)
    {
        return notSigned ?
            decodeNeonUTwoShiftSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, imm) :
            decodeNeonSTwoShiftSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1, imm);
    }

    // Asymmetric unsigned dispatcher: with q set, BaseQ is decoded through
    // decodeNeonUTwoShiftUReg (sizes 0-3); without q, BaseD is decoded
    // through decodeNeonUTwoShiftUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoShiftXReg(bool q, unsigned size, ExtMachInst machInst,
                            RegIndex dest, RegIndex op1, uint64_t imm)
    {
        if (!q) {
            return decodeNeonUTwoShiftUSReg<BaseD>(
                size, machInst, dest, op1, imm);
        }
        return decodeNeonUTwoShiftUReg<BaseQ>(
            size, machInst, dest, op1, imm);
    }

    // Asymmetric signed dispatcher: with q set, BaseQ is decoded through
    // decodeNeonSTwoShiftUReg (sizes 0-3); without q, BaseD is decoded
    // through decodeNeonSTwoShiftUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoShiftXReg(bool q, unsigned size, ExtMachInst machInst,
                            RegIndex dest, RegIndex op1, uint64_t imm)
    {
        if (!q) {
            return decodeNeonSTwoShiftUSReg<BaseD>(
                size, machInst, dest, op1, imm);
        }
        return decodeNeonSTwoShiftUReg<BaseQ>(
            size, machInst, dest, op1, imm);
    }

    // Map the two-bit size to an unsigned element type:
    // 0b11 -> uint64_t, 0b10 -> uint32_t, 0b01 -> uint16_t. Any other
    // value (including 0b00) yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoShiftUFpReg(unsigned size, ExtMachInst machInst,
                              RegIndex dest, RegIndex op1, uint64_t imm)
    {
        if (size == 0b11)
            return new Base<uint64_t>(machInst, dest, op1, imm);
        if (size == 0b10)
            return new Base<uint32_t>(machInst, dest, op1, imm);
        if (size == 0b01)
            return new Base<uint16_t>(machInst, dest, op1, imm);
        return new Unknown(machInst);
    }

    // Map size to an unsigned element type and q to BaseQ/BaseD:
    // 0b01 -> uint16_t, 0b10 -> uint32_t, 0b11 -> uint64_t. The uint64_t
    // case exists for BaseQ only. Every other combination yields Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoShiftFpReg(bool q, unsigned size, ExtMachInst machInst,
                             RegIndex dest, RegIndex op1, uint64_t imm)
    {
        switch (size) {
          case 0b01:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1, imm);
            return new BaseD<uint16_t>(machInst, dest, op1, imm);
          case 0b10:
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1, imm);
            return new BaseD<uint32_t>(machInst, dest, op1, imm);
          case 0b11:
            // No BaseD counterpart for this size; fall out to Unknown.
            if (q)
                return new BaseQ<uint64_t>(machInst, dest, op1, imm);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Instantiate Base from (machInst, dest, op1) with an unsigned element
    // type of at most 32 bits (size 0, 1, 2 -> uint8_t, uint16_t,
    // uint32_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoMiscUSReg(unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1);
        return new Unknown(machInst);
    }

    // Instantiate Base from (machInst, dest, op1) with a signed element
    // type of at most 32 bits (size 0, 1, 2 -> int8_t, int16_t, int32_t).
    // Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSTwoMiscUSReg(unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1);
        return new Unknown(machInst);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonUTwoMiscUSReg choose the unsigned element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoMiscSReg(bool q, unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1)
    {
        if (!q)
            return decodeNeonUTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonUTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
    }

    // Select BaseQ when q is set and BaseD otherwise, then let
    // decodeNeonSTwoMiscUSReg choose the signed element type from size.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoMiscSReg(bool q, unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1)
    {
        if (!q)
            return decodeNeonSTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonSTwoMiscUSReg<BaseQ>(size, machInst, dest, op1);
    }

    // Instantiate Base from (machInst, dest, op1) with the unsigned
    // element type selected by size (0, 1, 2, 3 -> uint8_t, uint16_t,
    // uint32_t, uint64_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoMiscUReg(unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1)
    {
        if (size == 0)
            return new Base<uint8_t>(machInst, dest, op1);
        if (size == 1)
            return new Base<uint16_t>(machInst, dest, op1);
        if (size == 2)
            return new Base<uint32_t>(machInst, dest, op1);
        if (size == 3)
            return new Base<uint64_t>(machInst, dest, op1);
        return new Unknown(machInst);
    }

    // Instantiate Base from (machInst, dest, op1) with the signed element
    // type selected by size (0, 1, 2, 3 -> int8_t, int16_t, int32_t,
    // int64_t). Any other size yields Unknown.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonSTwoMiscUReg(unsigned size,
                           ExtMachInst machInst, RegIndex dest,
                           RegIndex op1)
    {
        if (size == 0)
            return new Base<int8_t>(machInst, dest, op1);
        if (size == 1)
            return new Base<int16_t>(machInst, dest, op1);
        if (size == 2)
            return new Base<int32_t>(machInst, dest, op1);
        if (size == 3)
            return new Base<int64_t>(machInst, dest, op1);
        return new Unknown(machInst);
    }

    // Select BaseQ when q is set and BaseD otherwise; either way the
    // signed element type (sizes 0-3) comes from decodeNeonSTwoMiscUReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoMiscReg(bool q, unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1)
    {
        if (!q)
            return decodeNeonSTwoMiscUReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonSTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
    }

    // Select BaseQ when q is set and BaseD otherwise; either way the
    // unsigned element type (sizes 0-3) comes from decodeNeonUTwoMiscUReg.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoMiscReg(bool q, unsigned size,
                          ExtMachInst machInst, RegIndex dest,
                          RegIndex op1)
    {
        if (!q)
            return decodeNeonUTwoMiscUReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonUTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
    }

    // Signedness dispatcher for the two-operand (dest, op1) "misc"
    // decoders: notSigned forwards to decodeNeonUTwoMiscSReg, otherwise to
    // decodeNeonSTwoMiscSReg. Both pick BaseQ or BaseD from q and an
    // element type of at most 32 bits from size.
    //
    // This must target the TwoMisc helpers rather than the TwoShift ones:
    // the TwoShift decoders take an additional uint64_t imm argument that
    // this function does not have, so forwarding to them could never be
    // instantiated successfully.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUSTwoMiscSReg(bool q, bool notSigned, unsigned size,
                            ExtMachInst machInst, RegIndex dest,
                            RegIndex op1)
    {
        if (notSigned) {
            return decodeNeonUTwoMiscSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1);
        } else {
            return decodeNeonSTwoMiscSReg<BaseD, BaseQ>(
                    q, size, machInst, dest, op1);
        }
    }

    // Asymmetric unsigned dispatcher: with q set, BaseQ is decoded through
    // decodeNeonUTwoMiscUReg (sizes 0-3); without q, BaseD is decoded
    // through decodeNeonUTwoMiscUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoMiscXReg(bool q, unsigned size, ExtMachInst machInst,
                           RegIndex dest, RegIndex op1)
    {
        if (!q)
            return decodeNeonUTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonUTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
    }

    // Asymmetric signed dispatcher: with q set, BaseQ is decoded through
    // decodeNeonSTwoMiscUReg (sizes 0-3); without q, BaseD is decoded
    // through decodeNeonSTwoMiscUSReg (sizes 0-2 only).
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSTwoMiscXReg(bool q, unsigned size, ExtMachInst machInst,
                           RegIndex dest, RegIndex op1)
    {
        if (!q)
            return decodeNeonSTwoMiscUSReg<BaseD>(size, machInst, dest, op1);
        return decodeNeonSTwoMiscUReg<BaseQ>(size, machInst, dest, op1);
    }

    // Decode table, keyed on (q, size != 0):
    //   q,  size  -> BaseQ<uint64_t>
    //   q,  !size -> BaseQ<uint32_t>
    //   !q, size  -> Unknown
    //   !q, !size -> BaseD<uint32_t>
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoMiscFpReg(bool q, unsigned size, ExtMachInst machInst,
                            RegIndex dest, RegIndex op1)
    {
        if (q && size)
            return new BaseQ<uint64_t>(machInst, dest, op1);
        if (q)
            return new BaseQ<uint32_t>(machInst, dest, op1);
        if (size)
            return new Unknown(machInst);
        return new BaseD<uint32_t>(machInst, dest, op1);
    }

    // Decode table, keyed on (u != 0, size != 0):
    //   u,  size  -> BaseQ<uint64_t>
    //   u,  !size -> BaseD<uint32_t>
    //   !u, size  -> Unknown64
    //   !u, !size -> BaseS<uint16_t>
    template <template <typename T> class BaseS,
              template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUTwoMiscPwiseScFpReg(unsigned u, unsigned size,
                                   ExtMachInst machInst,
                                   RegIndex dest, RegIndex op1)
    {
        if (u && size)
            return new BaseQ<uint64_t>(machInst, dest, op1);
        if (u)
            return new BaseD<uint32_t>(machInst, dest, op1);
        if (size)
            return new Unknown64(machInst);
        return new BaseS<uint16_t>(machInst, dest, op1);
    }

    // Instantiate Base with uint64_t when size is non-zero and with uint32_t
    // when it is zero.
    template <template <typename T> class Base>
    StaticInstPtr
    decodeNeonUTwoMiscScFpReg(unsigned size, ExtMachInst machInst,
                              RegIndex dest, RegIndex op1)
    {
        if (!size)
            return new Base<uint32_t>(machInst, dest, op1);
        return new Base<uint64_t>(machInst, dest, op1);
    }

    // Map size 0/1/2 to uint8_t/uint16_t/uint32_t elements. BaseQ is used
    // when q is set, BaseD otherwise; size 2 is only accepted with q set.
    // Every other combination decodes to Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonUAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
                              RegIndex dest, RegIndex op1)
    {
        switch (size) {
          case 0x0:
            if (q)
                return new BaseQ<uint8_t>(machInst, dest, op1);
            return new BaseD<uint8_t>(machInst, dest, op1);
          case 0x1:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1);
            return new BaseD<uint16_t>(machInst, dest, op1);
          case 0x2:
            if (q)
                return new BaseQ<uint32_t>(machInst, dest, op1);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Three-template overload: same size/q mapping as the two-template
    // version, except that the (q, size == 2) case instantiates
    // BaseBQ<uint32_t> instead of BaseQ. Unsupported combinations decode to
    // Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ,
              template <typename T> class BaseBQ>
    StaticInstPtr
    decodeNeonUAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
                              RegIndex dest, RegIndex op1)
    {
        switch (size) {
          case 0x0:
            if (q)
                return new BaseQ<uint8_t>(machInst, dest, op1);
            return new BaseD<uint8_t>(machInst, dest, op1);
          case 0x1:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1);
            return new BaseD<uint16_t>(machInst, dest, op1);
          case 0x2:
            if (q)
                return new BaseBQ<uint32_t>(machInst, dest, op1);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Map size 0/1/2 to int8_t/int16_t/int32_t elements. BaseQ is used when
    // q is set, BaseD otherwise; size 2 is only accepted with q set. Every
    // other combination decodes to Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ>
    StaticInstPtr
    decodeNeonSAcrossLanesReg(bool q, unsigned size, ExtMachInst machInst,
                              RegIndex dest, RegIndex op1)
    {
        switch (size) {
          case 0x0:
            if (q)
                return new BaseQ<int8_t>(machInst, dest, op1);
            return new BaseD<int8_t>(machInst, dest, op1);
          case 0x1:
            if (q)
                return new BaseQ<int16_t>(machInst, dest, op1);
            return new BaseD<int16_t>(machInst, dest, op1);
          case 0x2:
            if (q)
                return new BaseQ<int32_t>(machInst, dest, op1);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Map size 0/1 to uint8_t/uint16_t on BaseQ (q set) or BaseD (q clear).
    // Size 2 is accepted only with q set and instantiates BaseBQ<uint32_t>.
    // Every other combination decodes to Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ,
              template <typename T> class BaseBQ>
    StaticInstPtr
    decodeNeonUAcrossLanesLongReg(bool q, unsigned size, ExtMachInst machInst,
                                  RegIndex dest, RegIndex op1)
    {
        switch (size) {
          case 0x0:
            if (q)
                return new BaseQ<uint8_t>(machInst, dest, op1);
            return new BaseD<uint8_t>(machInst, dest, op1);
          case 0x1:
            if (q)
                return new BaseQ<uint16_t>(machInst, dest, op1);
            return new BaseD<uint16_t>(machInst, dest, op1);
          case 0x2:
            if (q)
                return new BaseBQ<uint32_t>(machInst, dest, op1);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }

    // Map size 0/1 to int8_t/int16_t on BaseQ (q set) or BaseD (q clear).
    // Size 2 is accepted only with q set and instantiates BaseBQ<int32_t>.
    // Every other combination decodes to Unknown.
    template <template <typename T> class BaseD,
              template <typename T> class BaseQ,
              template <typename T> class BaseBQ>
    StaticInstPtr
    decodeNeonSAcrossLanesLongReg(bool q, unsigned size, ExtMachInst machInst,
                                  RegIndex dest, RegIndex op1)
    {
        switch (size) {
          case 0x0:
            if (q)
                return new BaseQ<int8_t>(machInst, dest, op1);
            return new BaseD<int8_t>(machInst, dest, op1);
          case 0x1:
            if (q)
                return new BaseQ<int16_t>(machInst, dest, op1);
            return new BaseD<int16_t>(machInst, dest, op1);
          case 0x2:
            if (q)
                return new BaseBQ<int32_t>(machInst, dest, op1);
            break;
          default:
            break;
        }
        return new Unknown(machInst);
    }
}};

let {{
    # Accumulators for the generated C++ text. The generator functions
    # defined later in this let block append to them through `global`.
    header_output = ""
    exec_output = ""

    # C++ helpers vcgtFunc / vcgeFunc / vceqFunc. Each returns 2.0 when its
    # NaN test fires on either operand (std::isnan for vcgt/vcge, isSnan for
    # vceq), 0.0 when the comparison holds and 1.0 when it does not.
    vcompares = '''
    static float
    vcgtFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (op1 > op2) ? 0.0 : 1.0;
    }

    static float
    vcgeFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (op1 >= op2) ? 0.0 : 1.0;
    }

    static float
    vceqFunc(float op1, float op2)
    {
        if (isSnan(op1) || isSnan(op2))
            return 2.0;
        return (op1 == op2) ? 0.0 : 1.0;
    }
'''
    # C++ helpers vcleFunc / vcltFunc with the same 2.0 / 0.0 / 1.0 return
    # convention. This string is only defined here; it is not part of the
    # exec_output concatenation at the end of this section.
    vcomparesL = '''
    static float
    vcleFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (op1 <= op2) ? 0.0 : 1.0;
    }

    static float
    vcltFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (op1 < op2) ? 0.0 : 1.0;
    }
'''
    # C++ helpers vacgtFunc / vacgeFunc: same return convention, but the
    # comparison is made on fabsf() of both operands.
    vacomparesG = '''
    static float
    vacgtFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (fabsf(op1) > fabsf(op2)) ? 0.0 : 1.0;
    }

    static float
    vacgeFunc(float op1, float op2)
    {
        if (std::isnan(op1) || std::isnan(op2))
            return 2.0;
        return (fabsf(op1) >= fabsf(op2)) ? 0.0 : 1.0;
    }
'''

    # Emit the helpers into the exec output. Only vcompares and vacomparesG
    # are added here.
    exec_output += vcompares + vacomparesG

    # Element type names passed as `types` to the generator functions.
    # The "small" tuples cover 8/16/32-bit names; the full tuples append the
    # 64-bit name. smallTypes and allTypes list unsigned names first.
    smallUnsignedTypes = tuple("uint%d_t" % bits for bits in (8, 16, 32))
    unsignedTypes = smallUnsignedTypes + ("uint64_t",)
    smallSignedTypes = tuple("int%d_t" % bits for bits in (8, 16, 32))
    signedTypes = smallSignedTypes + ("int64_t",)
    smallTypes = smallUnsignedTypes + smallSignedTypes
    allTypes = unsignedTypes + signedTypes

    # Generate a three-operand instruction whose two sources and destination
    # use the same element type, and append its declaration and execute code
    # to header_output / exec_output.
    #
    #   name, Name    - passed to ArmInstObjParams; Name is also the
    #                   class_name used for the per-type NeonExecDeclare.
    #   opClass       - stored as "op_class" in the instruction parameters.
    #   types         - one NeonExecDeclare is emitted per entry ("targs").
    #   rCount        - number of register pieces (FpOp1P<n>_uw,
    #                   FpOp2P<n>_uw, FpDestP<n>_uw) copied in and out.
    #   op            - C++ snippet spliced into the element loop; the loop
    #                   stores destElem afterwards, so op is expected to set
    #                   it from srcElem1 / srcElem2.
    #   readDest      - also load the destination pieces and pre-load
    #                   destElem from them before op runs.
    #   pairwise      - feed op with adjacent element pairs taken from
    #                   srcReg1 followed by srcReg2.
    #   byElem        - declare srcReg2 as FullRegVect, read pieces up to
    #                   index 3, and use the RegRegRegImmOp base/declare.
    #   standardFpcsr - wrap the walk with a local `fpscr` derived from
    #                   FpscrExc and write it back afterwards.
    #   complex       - append op verbatim instead of generating a loop.
    #   extra         - code placed before the element loop; only used on the
    #                   default (neither complex nor pairwise) path.
    def threeEqualRegInst(name, Name, opClass, types, rCount, op,
                          readDest=False, pairwise=False, byElem=False,
                          standardFpcsr=False, complex=False, extra=''):
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
                    RegVect srcReg1, destReg;
                    '''
        if byElem:
            # 2nd register operand has to be read fully
            eWalkCode += '''
                FullRegVect srcReg2;
                '''
        else:
            eWalkCode += '''
            RegVect srcReg2;
            '''
        # Copy the first rCount pieces of each operand into the local
        # vectors (and the destination too when it is an input).
        for reg in range(rCount):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
                srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
            ''' % { "reg" : reg }
            if readDest:
                eWalkCode += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        if byElem:
            # 2nd operand has to be read fully
            for reg in range(rCount, 4):
                eWalkCode += '''
        srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
        ''' % { "reg" : reg }

        readDestCode = ''
        if standardFpcsr:
            eWalkCode += '''
            FPSCR fpscr = fpStandardFPSCRValue((FPSCR)FpscrExc);
            '''
        if readDest:
            readDestCode = 'destElem = letoh(destReg.elements[i]);'

        # Exactly one of three bodies is emitted: op as-is (complex), the
        # pairwise loop, or the plain element-by-element loop.
        if complex:
            eWalkCode += op
        elif pairwise:
            eWalkCode += '''
            for (unsigned i = 0; i < eCount; i++) {
                Element srcElem1 = letoh(2 * i < eCount ?
                                        srcReg1.elements[2 * i] :
                                        srcReg2.elements[2 * i - eCount]);
                Element srcElem2 = letoh(2 * i < eCount ?
                                        srcReg1.elements[2 * i + 1] :
                                        srcReg2.elements[2 * i + 1 - eCount]);
                Element destElem;
                %(readDest)s
                %(op)s
                destReg.elements[i] = htole(destElem);
            }
            ''' % { "op" : op, "readDest" : readDestCode }
        else:
            eWalkCode += extra
            eWalkCode += '''
            for (unsigned i = 0; i < eCount; i++) {
                Element srcElem1 = letoh(srcReg1.elements[i]);
                Element srcElem2 = letoh(srcReg2.elements[i]);
                Element destElem;
                %(readDest)s
                %(op)s
                destReg.elements[i] = htole(destElem);
            }
            ''' % { "op" : op, "readDest" : readDestCode }
        if standardFpcsr:
            eWalkCode += '''
            FpscrExc = fpscr;
            '''
        # Copy the result vector back to the destination pieces.
        for reg in range(rCount):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegRegImmOp" if byElem else "RegRegRegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        if byElem:
            header_output += NeonRegRegRegImmOpDeclare.subst(iop)
        else:
            header_output += NeonRegRegRegOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Generate a three-operand instruction that walks its operands as arrays
    # of `float` (one per register piece) rather than as packed elements,
    # and append it to header_output / exec_output.
    #
    #   name, Name - passed to ArmInstObjParams; Name is also the class_name
    #                used for the per-type NeonExecDeclare.
    #   opClass    - stored as "op_class" in the instruction parameters.
    #   types      - one NeonExecDeclare is emitted per entry ("targs").
    #   rCount     - number of register pieces read and written.
    #   op         - C++ snippet spliced into the loop; the loop stores
    #                destReg afterwards, so op is expected to set it from
    #                srcReg1 / srcReg2.
    #   readDest   - also load the destination pieces and pre-load destReg.
    #   pairwise   - feed op with adjacent pairs taken from srcRegs1 followed
    #                by srcRegs2.
    #   toInt      - the destination is a RegVect of uint32_t values instead
    #                of a float array, and is written back via the _uw pieces.
    def threeEqualRegInstFp(name, Name, opClass, types, rCount, op,
                            readDest=False, pairwise=False, toInt=False):
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        typedef float FloatVect[rCount];
        FloatVect srcRegs1, srcRegs2;
        '''
        if toInt:
            eWalkCode += 'RegVect destRegs;\n'
        else:
            eWalkCode += 'FloatVect destRegs;\n'
        for reg in range(rCount):
            eWalkCode += '''
                srcRegs1[%(reg)d] = FpOp1P%(reg)d;
                srcRegs2[%(reg)d] = FpOp2P%(reg)d;
            ''' % { "reg" : reg }
            if readDest:
                if toInt:
                    eWalkCode += '''
                        destRegs.regs[%(reg)d] = FpDestP%(reg)d.bits;
                    ''' % { "reg" : reg }
                else:
                    eWalkCode += '''
                        destRegs[%(reg)d] = FpDestP%(reg)d;
                    ''' % { "reg" : reg }
        # NOTE(review): this read-back indexes destRegs directly, while the
        # toInt path declares destRegs as RegVect and otherwise accesses it
        # through .regs[]. Confirm readDest and toInt are never combined.
        readDestCode = ''
        if readDest:
            readDestCode = 'destReg = destRegs[r];'
        # Type and store statement for the per-iteration result; toInt
        # switches both to the integer form.
        destType = 'float'
        writeDest = 'destRegs[r] = destReg;'
        if toInt:
            destType = 'uint32_t'
            writeDest = 'destRegs.regs[r] = destReg;'
        if pairwise:
            eWalkCode += '''
            for (unsigned r = 0; r < rCount; r++) {
                float srcReg1 = (2 * r < rCount) ?
                    srcRegs1[2 * r] : srcRegs2[2 * r - rCount];
                float srcReg2 = (2 * r < rCount) ?
                    srcRegs1[2 * r + 1] : srcRegs2[2 * r + 1 - rCount];
                %(destType)s destReg;
                %(readDest)s
                %(op)s
                %(writeDest)s
            }
            ''' % { "op" : op,
                    "readDest" : readDestCode,
                    "destType" : destType,
                    "writeDest" : writeDest }
        else:
            eWalkCode += '''
            for (unsigned r = 0; r < rCount; r++) {
                float srcReg1 = srcRegs1[r];
                float srcReg2 = srcRegs2[r];
                %(destType)s destReg;
                %(readDest)s
                %(op)s
                %(writeDest)s
            }
            ''' % { "op" : op,
                    "readDest" : readDestCode,
                    "destType" : destType,
                    "writeDest" : writeDest }
        # Write the results back: integer results go to the _uw pieces,
        # float results to the plain FpDestP<n> pieces.
        for reg in range(rCount):
            if toInt:
                eWalkCode += '''
                FpDestP%(reg)d_uw = destRegs.regs[%(reg)d];
                ''' % { "reg" : reg }
            else:
                eWalkCode += '''
                FpDestP%(reg)d = destRegs[%(reg)d];
                ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "FpRegRegRegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegRegOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Generate a three-operand instruction in which any of the two sources
    # and the destination may use the "Big" vector/element types, and append
    # it to header_output / exec_output.
    #
    #   name, Name  - passed to ArmInstObjParams; Name is also the class_name
    #                 used for the per-type NeonExecDeclare.
    #   opClass     - stored as "op_class" in the instruction parameters.
    #   types       - one NeonExecDeclare is emitted per entry ("targs").
    #   op          - C++ snippet spliced into the element loop; the loop
    #                 stores destElem afterwards.
    #   bigSrc1, bigSrc2, bigDest
    #               - select BigRegVect and double the number of register
    #                 pieces for that operand.
    #   readDest    - also load the destination pieces and pre-load destElem.
    #   short       - halve every piece count (1/2 instead of 2/4) and use
    #                 r_count 1 instead of 2.
    #   extra_check - code inserted right after simdEnabledCheckCode.
    def threeUnequalRegInst(name, Name, opClass, types, op,
                            bigSrc1, bigSrc2, bigDest, readDest, short=False,
                            extra_check=''):
        global header_output, exec_output
        # Default piece count for a non-big operand; the big variants below
        # use twice as many pieces.
        src1Cnt = src2Cnt = destCnt = 1 if short else 2
        src1Prefix = src2Prefix = destPrefix = ''
        if bigSrc1:
            src1Cnt = 2 if short else 4
            src1Prefix = 'Big'
        if bigSrc2:
            src2Cnt = 2 if short else 4
            src2Prefix = 'Big'
        if bigDest:
            destCnt = 2 if short else 4
            destPrefix = 'Big'
        eWalkCode = simdEnabledCheckCode + extra_check + '''
            %sRegVect srcReg1;
            %sRegVect srcReg2;
            %sRegVect destReg;
        ''' % (src1Prefix, src2Prefix, destPrefix)
        for reg in range(src1Cnt):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % { "reg" : reg }
        for reg in range(src2Cnt):
            eWalkCode += '''
                srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
            ''' % { "reg" : reg }
        if readDest:
            for reg in range(destCnt):
                eWalkCode += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        # NOTE(review): srcElem2 is declared with src1Prefix, so it takes the
        # element type of the first source even when bigSrc1 != bigSrc2;
        # "src2Prefix" is supplied to the format dict but not referenced by
        # the template. Confirm whether this is deliberate before changing it.
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(src1Prefix)sElement srcElem1 = letoh(srcReg1.elements[i]);
            %(src1Prefix)sElement srcElem2 = letoh(srcReg2.elements[i]);
            %(destPrefix)sElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode,
                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
                "destPrefix" : destPrefix }
        # Copy the result vector back to the destination pieces.
        for reg in range(destCnt):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegRegOp",
                               { "code": eWalkCode,
                                 "r_count": 1 if short else 2,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegRegOpDeclare.subst(iop)
        exec_output += NeonUnequalRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # threeUnequalRegInst preset: both sources big, destination normal,
    # never short.
    def threeRegNarrowInst(name, Name, opClass, types, op, readDest=False,
                           extra_check=''):
        threeUnequalRegInst(name, Name, opClass, types, op,
                            bigSrc1=True, bigSrc2=True, bigDest=False,
                            readDest=readDest, short=False,
                            extra_check=extra_check)

    # threeUnequalRegInst preset: both sources normal, destination big;
    # `short` is forwarded from the caller.
    def threeRegLongInst(name, Name, opClass, types, op, readDest=False,
                         short=False, extra_check=''):
        threeUnequalRegInst(name, Name, opClass, types, op,
                            bigSrc1=False, bigSrc2=False, bigDest=True,
                            readDest=readDest, short=short,
                            extra_check=extra_check)

    # threeUnequalRegInst preset: first source and destination big, second
    # source normal, never short.
    def threeRegWideInst(name, Name, opClass, types, op, readDest=False,
                         extra_check=''):
        threeUnequalRegInst(name, Name, opClass, types, op,
                            bigSrc1=True, bigSrc2=False, bigDest=True,
                            readDest=readDest, short=False,
                            extra_check=extra_check)

    # Generate an instruction that combines every element of the first
    # source with the single element of the second source selected by `imm`.
    # The generated code returns an UndefinedInstruction fault when imm is
    # not below eCount. `op` is spliced into the element loop, `extra` is
    # placed just before the imm check, and `readDest` additionally loads the
    # destination pieces and pre-loads destElem.
    def twoEqualRegInst(name, Name, opClass, types, rCount, op,
                        readDest=False, extra=''):
        global header_output, exec_output
        readDestCode = ('destElem = letoh(destReg.elements[i]);'
                        if readDest else '')
        walk = simdEnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
        '''
        # Load the source pieces (and the destination when it is an input).
        for piece in range(rCount):
            pieceArgs = { "reg" : piece }
            walk += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
                srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
            ''' % pieceArgs
            if readDest:
                walk += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % pieceArgs
        walk += extra
        walk += '''
        if (imm >= eCount) {
            return std::make_shared<UndefinedInstruction>(machInst, false,
                                                          mnemonic);
        } else {
            for (unsigned i = 0; i < eCount; i++) {
                Element srcElem1 = letoh(srcReg1.elements[i]);
                Element srcElem2 = letoh(srcReg2.elements[imm]);
                Element destElem;
                %(readDest)s
                %(op)s
                destReg.elements[i] = htole(destElem);
            }
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # Store the result vector back to the destination pieces.
        for piece in range(rCount):
            walk += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : piece }
        params = ArmInstObjParams(name, Name,
                                  "RegRegRegImmOp",
                                  { "code": walk,
                                    "r_count": rCount,
                                    "predicate_test": predicateTest,
                                    "op_class": opClass }, [])
        header_output += NeonRegRegRegImmOpDeclare.subst(params)
        exec_output += NeonEqualRegExecute.subst(params)
        # One NeonExecDeclare per requested element type.
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                    { "targs" : elemType, "class_name" : Name })

    # Generate an instruction that combines every element of the first
    # source with the element of the second source selected by `imm`, and
    # writes BigElement results into a BigRegVect destination.
    #
    #   name, Name  - passed to ArmInstObjParams; Name is also the class_name
    #                 used for the per-type NeonExecDeclare.
    #   opClass     - stored as "op_class" in the instruction parameters.
    #   types       - one NeonExecDeclare is emitted per entry ("targs").
    #   op          - C++ snippet spliced into the element loop; the loop
    #                 stores destElem afterwards.
    #   readDest    - also load the destination pieces and pre-load destElem.
    #   short       - use 1 source piece (2 destination pieces) instead of
    #                 2 (4).
    #   extra_check - code inserted right after simdEnabledCheckCode.
    def twoRegLongInst(name, Name, opClass, types, op, readDest=False,
                       short=False, extra_check=''):
        global header_output, exec_output
        rCount = 1 if short else 2
        eWalkCode = simdEnabledCheckCode + extra_check + '''
        RegVect srcReg1, srcReg2;
        BigRegVect destReg = {};
        '''
        # Load the source pieces. The doubled ';' after the second statement
        # only adds an empty statement to the generated C++.
        for reg in range(rCount):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
                srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);;
            ''' % { "reg" : reg }
        # The destination spans twice as many pieces as each source.
        if readDest:
            for reg in range(2 * rCount):
                eWalkCode += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        # An out-of-range imm sets `fault` instead of returning; the code
        # appended after this snippet still runs in that case.
        eWalkCode += '''
        if (imm >= eCount) {
            fault = std::make_shared<UndefinedInstruction>(machInst, false,
                                                          mnemonic);
        } else {
            for (unsigned i = 0; i < eCount; i++) {
                Element srcElem1 = letoh(srcReg1.elements[i]);
                Element srcElem2 = letoh(srcReg2.elements[imm]);
                BigElement destElem;
                %(readDest)s
                %(op)s
                destReg.elements[i] = htole(destElem);
            }
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        for reg in range(2 * rCount):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegRegImmOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegRegImmOpDeclare.subst(iop)
        exec_output += NeonUnequalRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Float-array counterpart of the by-immediate generator: each piece of
    # the first source is combined with piece `imm` of the second source.
    # The generated code returns an UndefinedInstruction fault when imm is
    # not below eCount.
    #
    #   name, Name - passed to ArmInstObjParams; Name is also the class_name
    #                used for the per-type NeonExecDeclare.
    #   opClass    - stored as "op_class" in the instruction parameters.
    #   types      - one NeonExecDeclare is emitted per entry ("targs").
    #   rCount     - number of register pieces read and written.
    #   op         - C++ snippet spliced into the loop; the loop stores
    #                destReg afterwards.
    #   readDest   - also load the destination pieces and pre-load destReg.
    def twoEqualRegInstFp(name, Name, opClass, types, rCount, op, readDest=False):
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        typedef float FloatVect[rCount];
        FloatVect srcRegs1, srcRegs2, destRegs;
        '''
        for reg in range(rCount):
            eWalkCode += '''
                srcRegs1[%(reg)d] = FpOp1P%(reg)d;
                srcRegs2[%(reg)d] = FpOp2P%(reg)d;
            ''' % { "reg" : reg }
            if readDest:
                eWalkCode += '''
                    destRegs[%(reg)d] = FpDestP%(reg)d;
                ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            readDestCode = 'destReg = destRegs[i];'
        # The bound check compares imm with eCount while the loop runs over
        # rCount and indexes srcRegs2 (an rCount-sized array) with imm.
        eWalkCode += '''
        if (imm >= eCount) {
            return std::make_shared<UndefinedInstruction>(machInst, false,
                                                          mnemonic);
        } else {
            for (unsigned i = 0; i < rCount; i++) {
                float srcReg1 = srcRegs1[i];
                float srcReg2 = srcRegs2[imm];
                float destReg;
                %(readDest)s
                %(op)s
                destRegs[i] = destReg;
            }
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # Store the float results back to the destination pieces.
        for reg in range(rCount):
            eWalkCode += '''
            FpDestP%(reg)d = destRegs[%(reg)d];
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "FpRegRegRegImmOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegRegImmOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Generate a one-source instruction with an immediate (RegRegImmOp base)
    # whose source and destination have the same size, and append it to
    # header_output / exec_output.
    #
    #   name, Name  - passed to ArmInstObjParams; Name is also the class_name
    #                 used for the per-type NeonExecDeclare.
    #   opClass     - stored as "op_class" in the instruction parameters.
    #   types       - one NeonExecDeclare is emitted per entry ("targs").
    #   rCount      - number of register pieces read and written.
    #   op          - C++ snippet spliced into the loop after the source read
    #                 and destination declaration.
    #   readDest    - also load the destination pieces and pre-load the
    #                 per-iteration destination variable.
    #   toInt       - the per-iteration destination is `uint32_t destReg`,
    #                 stored to destRegs.regs[i], instead of
    #                 `Element destElem`.
    #   fromInt     - the per-iteration source is `uint32_t srcReg1`, read
    #                 from srcRegs1.regs[i], instead of `Element srcElem1`.
    #   readSrcElem - when False, no per-iteration source read is emitted.
    def twoRegShiftInst(name, Name, opClass, types, rCount, op,
            readDest=False, toInt=False, fromInt=False, readSrcElem=True):
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        RegVect srcRegs1, destRegs;
        '''
        for reg in range(rCount):
            eWalkCode += '''
                srcRegs1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % { "reg" : reg }
            if readDest:
                eWalkCode += '''
                    destRegs.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        # Pick the per-iteration snippets; the toInt/fromInt variants replace
        # the element-based defaults.
        readDestCode = ''
        if readDest:
            readDestCode = 'destElem = letoh(destRegs.elements[i]);'
            if toInt:
                readDestCode = 'destReg = letoh(destRegs.regs[i]);'
        readOpCode = 'Element srcElem1 = letoh(srcRegs1.elements[i]);'
        if fromInt:
            readOpCode = 'uint32_t srcReg1 = letoh(srcRegs1.regs[i]);'
        declDest = 'Element destElem;'
        writeDestCode = 'destRegs.elements[i] = htole(destElem);'
        if toInt:
            declDest = 'uint32_t destReg;'
            writeDestCode = 'destRegs.regs[i] = htole(destReg);'
        # The loop bound is eCount in every variant, including the ones that
        # index .regs[] instead of .elements[].
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(readOp)s
            %(declDest)s
            %(readDest)s
            %(op)s
            %(writeDest)s
        }
        ''' % { "readOp" : readOpCode if readSrcElem else "",
                "declDest" : declDest,
                "readDest" : readDestCode,
                "op" : op,
                "writeDest" : writeDestCode }
        # Copy the result vector back to the destination pieces.
        for reg in range(rCount):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destRegs.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegImmOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegImmOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Generate a one-source instruction with an immediate that reads a
    # BigRegVect source (4 register pieces) and writes a RegVect destination
    # (2 pieces). `op` is spliced into the element loop, which converts a
    # BigElement srcElem1 into an Element destElem; `readDest` additionally
    # loads the destination pieces and pre-loads destElem.
    def twoRegNarrowShiftInst(name, Name, opClass, types, op, readDest=False):
        global header_output, exec_output
        srcPieces, destPieces = 4, 2
        readDestCode = ''
        walk = simdEnabledCheckCode + '''
        BigRegVect srcReg1;
        RegVect destReg;
        '''
        for piece in range(srcPieces):
            walk += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % { "reg" : piece }
        if readDest:
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
            for piece in range(destPieces):
                walk += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : piece }
        walk += '''
        for (unsigned i = 0; i < eCount; i++) {
            BigElement srcElem1 = letoh(srcReg1.elements[i]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # Store the narrowed result back to the destination pieces.
        for piece in range(destPieces):
            walk += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : piece }
        params = ArmInstObjParams(name, Name,
                                  "RegRegImmOp",
                                  { "code": walk,
                                    "r_count": 2,
                                    "predicate_test": predicateTest,
                                    "op_class": opClass }, [])
        header_output += NeonRegRegImmOpDeclare.subst(params)
        exec_output += NeonUnequalRegExecute.subst(params)
        # One NeonExecDeclare per requested element type.
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                    { "targs" : elemType, "class_name" : Name })

    # Generate a one-source instruction with an immediate that reads a
    # RegVect source (2 register pieces) and writes a BigRegVect destination
    # (4 pieces), and append it to header_output / exec_output.
    #
    #   name, Name - passed to ArmInstObjParams; Name is also the class_name
    #                used for the per-type NeonExecDeclare.
    #   opClass    - stored as "op_class" in the instruction parameters.
    #   types      - one NeonExecDeclare is emitted per entry ("targs").
    #   op         - C++ snippet spliced into the element loop; the loop
    #                stores the BigElement destElem afterwards.
    #   readDest   - also load the destination pieces and pre-load destElem
    #                from them before op runs.
    def twoRegLongShiftInst(name, Name, opClass, types, op, readDest=False):
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        RegVect srcReg1;
        BigRegVect destReg = {};
        '''
        for reg in range(2):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % { "reg" : reg }
        if readDest:
            for reg in range(4):
                eWalkCode += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            # Pre-load the per-element destination variable. This must target
            # destElem (declared in the loop below), not the destReg vector.
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(srcReg1.elements[i]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # Copy the widened result back to the four destination pieces.
        for reg in range(4):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegImmOp",
                               { "code": eWalkCode,
                                 "r_count": 2,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegImmOpDeclare.subst(iop)
        exec_output += NeonUnequalRegExecute.subst(iop)
        # One NeonExecDeclare per requested element type.
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Generate a one-source, no-immediate instruction (RegRegOp base) whose
    # source and destination share the element type. In the generated loop
    # the result is stored at index `j`, which starts equal to the loop
    # index `i` and is visible to `op`. `readDest` additionally loads the
    # destination pieces and pre-loads destElem.
    def twoRegMiscInst(name, Name, opClass, types, rCount, op, readDest=False):
        global header_output, exec_output
        readDestCode = ('destElem = letoh(destReg.elements[i]);'
                        if readDest else '')
        walk = simdEnabledCheckCode + '''
        RegVect srcReg1, destReg;
        '''
        # Load the source pieces (and the destination when it is an input).
        for piece in range(rCount):
            pieceArgs = { "reg" : piece }
            walk += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % pieceArgs
            if readDest:
                walk += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % pieceArgs
        walk += '''
        for (unsigned i = 0; i < eCount; i++) {
            unsigned j = i;
            Element srcElem1 = letoh(srcReg1.elements[i]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[j] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # Store the result vector back to the destination pieces.
        for piece in range(rCount):
            walk += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : piece }
        params = ArmInstObjParams(name, Name,
                                  "RegRegOp",
                                  { "code": walk,
                                    "r_count": rCount,
                                    "predicate_test": predicateTest,
                                    "op_class": opClass }, [])
        header_output += NeonRegRegOpDeclare.subst(params)
        exec_output += NeonEqualRegExecute.subst(params)
        # One NeonExecDeclare per requested element type.
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                    { "targs" : elemType, "class_name" : Name })

    def twoRegMiscScInst(name, Name, opClass, types, rCount, op, readDest=False):
        """Define a two-register NEON instruction that reads one fixed lane.

        Unlike twoRegMiscInst, every loop iteration reads
        srcReg1.elements[imm], so the same source element feeds each
        destination element. The instruction is built as a "RegRegImmOp".

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        rCount     -- number of registers loaded and written back.
        op         -- C++ snippet computing destElem from srcElem1.
        readDest   -- when true, the destination registers are loaded first
                      and destElem starts from the old destination element.
        """
        global header_output, exec_output
        walk = simdEnabledCheckCode + '''
        RegVect srcReg1, destReg;
        '''
        for n in range(rCount):
            regArgs = dict(reg=n)
            walk += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % regArgs
            if not readDest:
                continue
            walk += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % regArgs
        if readDest:
            seedDest = 'destElem = letoh(destReg.elements[i]);'
        else:
            seedDest = ''
        loopArgs = dict(op=op, readDest=seedDest)
        walk += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(srcReg1.elements[imm]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % loopArgs
        for n in range(rCount):
            walk += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % dict(reg=n)
        params = dict(code=walk,
                      r_count=rCount,
                      predicate_test=predicateTest,
                      op_class=opClass)
        instObj = ArmInstObjParams(name, Name, "RegRegImmOp", params, [])
        header_output += NeonRegRegImmOpDeclare.subst(instObj)
        exec_output += NeonEqualRegExecute.subst(instObj)
        for targ in types:
            declArgs = dict(targs=targ, class_name=Name)
            exec_output += NeonExecDeclare.subst(declArgs)

    def twoRegMiscScramble(name, Name, opClass, types, rCount, op, readDest=False):
        """Define a two-register NEON instruction that rewrites both operands.

        The generated C++ loads rCount registers of both the first operand
        (srcReg1) and the destination (destReg), runs `op` once on the whole
        vectors (there is no per-element loop here), and writes both
        vectors back to their registers.

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        rCount     -- number of registers loaded and written back.
        op         -- C++ snippet operating directly on srcReg1 / destReg.
        readDest   -- accepted for signature compatibility only. The
                      destination registers are always loaded, so this flag
                      has no effect on the generated instruction.
        """
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        RegVect srcReg1, destReg;
        '''
        # Both vectors are read unconditionally because both are written
        # back in full after `op` has run.
        for reg in range(rCount):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
                destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
            ''' % { "reg" : reg }
        eWalkCode += op
        for reg in range(rCount):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            FpOp1P%(reg)d_uw = letoh(srcReg1.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    def twoRegMiscInstFp(name, Name, opClass, types, rCount, op,
            readDest=False, toInt=False):
        """Define a two-register NEON instruction working on float registers.

        The generated C++ copies rCount float source registers into
        srcRegs1, runs `op` once per register (srcReg1 -> destReg) and
        writes the results back to the destination registers.

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        rCount     -- number of registers processed.
        op         -- C++ snippet computing destReg from srcReg1.
        readDest   -- when true, the destination registers are loaded first
                      and destReg starts from the old destination value.
        toInt      -- when true, the destination is held in a RegVect and
                      destReg is a uint32_t written through the _uw operand;
                      otherwise the destination is a float array.
        """
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        typedef float FloatVect[rCount];
        FloatVect srcRegs1;
        '''
        if toInt:
            eWalkCode += 'RegVect destRegs;\n'
        else:
            eWalkCode += 'FloatVect destRegs;\n'
        for reg in range(rCount):
            eWalkCode += '''
                srcRegs1[%(reg)d] = FpOp1P%(reg)d;
            ''' % { "reg" : reg }
            if readDest:
                if toInt:
                    # Load the raw word through the same _uw operand that
                    # the write-back at the end of this function uses.
                    eWalkCode += '''
                        destRegs.regs[%(reg)d] = FpDestP%(reg)d_uw;
                    ''' % { "reg" : reg }
                else:
                    eWalkCode += '''
                        destRegs[%(reg)d] = FpDestP%(reg)d;
                    ''' % { "reg" : reg }
        # The load/store statements must use the loop variable `r` of the
        # generated loop and the access form matching destRegs' type.
        destType = 'float'
        readDestCode = 'destReg = destRegs[r];'
        writeDest = 'destRegs[r] = destReg;'
        if toInt:
            destType = 'uint32_t'
            readDestCode = 'destReg = destRegs.regs[r];'
            writeDest = 'destRegs.regs[r] = destReg;'
        if not readDest:
            readDestCode = ''
        eWalkCode += '''
        for (unsigned r = 0; r < rCount; r++) {
            float srcReg1 = srcRegs1[r];
            %(destType)s destReg;
            %(readDest)s
            %(op)s
            %(writeDest)s
        }
        ''' % { "op" : op,
                "readDest" : readDestCode,
                "destType" : destType,
                "writeDest" : writeDest }
        for reg in range(rCount):
            if toInt:
                eWalkCode += '''
                FpDestP%(reg)d_uw = destRegs.regs[%(reg)d];
                ''' % { "reg" : reg }
            else:
                eWalkCode += '''
                FpDestP%(reg)d = destRegs[%(reg)d];
                ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "FpRegRegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegOpDeclare.subst(iop)
        exec_output += NeonEqualRegExecute.subst(iop)
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    def twoRegCondenseInst(name, Name, opClass, types, rCount, op, readDest=False):
        """Define a two-register NEON instruction that combines element pairs.

        The generated loop runs eCount / 2 times; iteration i reads source
        elements 2*i and 2*i + 1 (srcElem1, srcElem2) and stores one
        BigElement result at destination index i.

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        rCount     -- number of registers loaded and written back.
        op         -- C++ snippet computing destElem from srcElem1/srcElem2.
        readDest   -- when true, the destination registers are loaded first
                      and destElem starts from the old destination element.
        """
        global header_output, exec_output
        chunks = [simdEnabledCheckCode + '''
        RegVect srcRegs;
        BigRegVect destReg = {};
        ''']
        for regIdx in range(rCount):
            fmtArgs = dict(reg=regIdx)
            chunks.append('''
                srcRegs.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % fmtArgs)
            if readDest:
                chunks.append('''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % fmtArgs)
        oldDest = 'destElem = letoh(destReg.elements[i]);' if readDest else ''
        chunks.append('''
        for (unsigned i = 0; i < eCount / 2; i++) {
            Element srcElem1 = letoh(srcRegs.elements[2 * i]);
            Element srcElem2 = letoh(srcRegs.elements[2 * i + 1]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % dict(op=op, readDest=oldDest))
        for regIdx in range(rCount):
            chunks.append('''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % dict(reg=regIdx))
        instParams = ArmInstObjParams(
            name, Name, "RegRegOp",
            dict(code=''.join(chunks), r_count=rCount,
                 predicate_test=predicateTest, op_class=opClass),
            [])
        header_output += NeonRegRegOpDeclare.subst(instParams)
        exec_output += NeonUnequalRegExecute.subst(instParams)
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                dict(targs=elemType, class_name=Name))

    def twoRegNarrowMiscInst(name, Name, opClass, types, op, readDest=False,
                             extra_check=''):
        """Define a two-register NEON instruction that narrows its elements.

        The generated C++ loads four source registers as a BigRegVect,
        runs `op` per element (BigElement srcElem1 -> Element destElem) and
        writes two destination registers.

        name, Name  -- mnemonic and class name handed to ArmInstObjParams.
        opClass     -- value stored under "op_class".
        types       -- element types; one NeonExecDeclare per entry.
        op          -- C++ snippet computing destElem from srcElem1.
        readDest    -- when true, the two destination registers are loaded
                       first and destElem starts from the old element.
        extra_check -- C++ text inserted right after simdEnabledCheckCode.
        """
        global header_output, exec_output
        body = [simdEnabledCheckCode, extra_check, '''
        BigRegVect srcReg1;
        RegVect destReg;
        ''']
        body.extend('''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % dict(reg=k) for k in range(4))
        destLoad = ''
        if readDest:
            body.extend('''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % dict(reg=k) for k in range(2))
            destLoad = 'destElem = letoh(destReg.elements[i]);'
        body.append('''
        for (unsigned i = 0; i < eCount; i++) {
            BigElement srcElem1 = letoh(srcReg1.elements[i]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % dict(op=op, readDest=destLoad))
        body.extend('''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % dict(reg=k) for k in range(2))
        instObj = ArmInstObjParams(
            name, Name, "RegRegOp",
            dict(code=''.join(body), r_count=2,
                 predicate_test=predicateTest, op_class=opClass),
            [])
        header_output += NeonRegRegOpDeclare.subst(instObj)
        exec_output += NeonUnequalRegExecute.subst(instObj)
        for targ in types:
            exec_output += NeonExecDeclare.subst(
                dict(targs=targ, class_name=Name))

    def oneRegImmInst(name, Name, opClass, types, rCount, op, readDest=False):
        """Define a NEON instruction with one register operand ("RegImmOp").

        The generated C++ has no source vector: `op` produces destElem for
        each element, and destReg is written to rCount destination
        registers.

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        rCount     -- number of destination registers.
        op         -- C++ snippet computing destElem.
        readDest   -- when true, the destination registers are loaded first
                      and destElem starts from the old destination element.
        """
        global header_output, exec_output
        code = simdEnabledCheckCode + '''
        RegVect destReg;
        '''
        seedDest = ''
        if readDest:
            code += ''.join('''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % dict(reg=r) for r in range(rCount))
            seedDest = 'destElem = letoh(destReg.elements[i]);'
        code += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % dict(op=op, readDest=seedDest)
        code += ''.join('''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % dict(reg=r) for r in range(rCount))
        instObj = ArmInstObjParams(
            name, Name, "RegImmOp",
            dict(code=code, r_count=rCount,
                 predicate_test=predicateTest, op_class=opClass),
            [])
        header_output += NeonRegImmOpDeclare.subst(instObj)
        exec_output += NeonEqualRegExecute.subst(instObj)
        for targ in types:
            exec_output += NeonExecDeclare.subst(
                dict(targs=targ, class_name=Name))

    def twoRegLongMiscInst(name, Name, opClass, types, op, readDest=False):
        """Define a two-register NEON instruction that widens its elements.

        The generated C++ loads two source registers as a RegVect, runs
        `op` per element (Element srcElem1 -> BigElement destElem) and
        writes four destination registers from a BigRegVect.

        name, Name -- mnemonic and class name handed to ArmInstObjParams.
        opClass    -- value stored under "op_class".
        types      -- element types; one NeonExecDeclare per entry.
        op         -- C++ snippet computing destElem from srcElem1.
        readDest   -- when true, the four destination registers are loaded
                      first and destElem starts from the old element.
        """
        global header_output, exec_output
        eWalkCode = simdEnabledCheckCode + '''
        RegVect srcReg1;
        BigRegVect destReg = {};
        '''
        for reg in range(2):
            eWalkCode += '''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
            ''' % { "reg" : reg }
        if readDest:
            for reg in range(4):
                eWalkCode += '''
                    destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            # Seed the per-element accumulator, not the whole register
            # vector: destElem is the BigElement declared inside the loop.
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(srcReg1.elements[i]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        for reg in range(4):
            eWalkCode += '''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "RegRegOp",
                               { "code": eWalkCode,
                                 "r_count": 2,
                                 "predicate_test": predicateTest,
                                 "op_class": opClass }, [])
        header_output += NeonRegRegOpDeclare.subst(iop)
        exec_output += NeonUnequalRegExecute.subst(iop)
        for type in types:
            substDict = { "targs" : type,
                          "class_name" : Name }
            exec_output += NeonExecDeclare.subst(substDict)

    # Halving add. Bit 0 of each operand is masked off before the operand is
    # halved; the carry produced by adding the two discarded low bits is
    # added back afterwards. Each snippet below is registered twice; the two
    # registrations differ only in the class name and the numeric argument
    # (2 vs 4), presumably the register count -- TODO confirm against
    # threeEqualRegInst.
    vhaddCode = '''
        Element carryBit =
            (((unsigned)srcElem1 & 0x1) +
             ((unsigned)srcElem2 & 0x1)) >> 1;
        // Use division instead of a shift to ensure the sign extension works
        // right. The compiler will figure out if it can be a shift. Mask the
        // inputs so they get truncated correctly.
        destElem = (((srcElem1 & ~(Element)1) / 2) +
                    ((srcElem2 & ~(Element)1) / 2)) + carryBit;
    '''
    threeEqualRegInst("vhadd", "VhaddD", "SimdAddOp", allTypes, 2, vhaddCode)
    threeEqualRegInst("vhadd", "VhaddQ", "SimdAddOp", allTypes, 4, vhaddCode)

    # Rounding halving add. Same as vhaddCode except that an extra 1 is added
    # to the low-bit sum before it is shifted down to form the carry.
    vrhaddCode = '''
        Element carryBit =
            (((unsigned)srcElem1 & 0x1) +
             ((unsigned)srcElem2 & 0x1) + 1) >> 1;
        // Use division instead of a shift to ensure the sign extension works
        // right. The compiler will figure out if it can be a shift. Mask the
        // inputs so they get truncated correctly.
        destElem = (((srcElem1 & ~(Element)1) / 2) +
                    ((srcElem2 & ~(Element)1) / 2)) + carryBit;
    '''
    threeEqualRegInst("vrhadd", "VrhaddD", "SimdAddOp", allTypes, 2, vrhaddCode)
    threeEqualRegInst("vrhadd", "VrhaddQ", "SimdAddOp", allTypes, 4, vrhaddCode)

    # Halving subtract. The halves of the masked operands are subtracted and
    # a borrow derived from the difference of the two low bits is subtracted
    # from the result (the C++ local is spelled "barrowBit").
    vhsubCode = '''
        Element barrowBit =
            (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
        // Use division instead of a shift to ensure the sign extension works
        // right. The compiler will figure out if it can be a shift. Mask the
        // inputs so they get truncated correctly.
        destElem = (((srcElem1 & ~(Element)1) / 2) -
                    ((srcElem2 & ~(Element)1) / 2)) - barrowBit;
    '''
    threeEqualRegInst("vhsub", "VhsubD", "SimdAddOp", allTypes, 2, vhsubCode)
    threeEqualRegInst("vhsub", "VhsubQ", "SimdAddOp", allTypes, 4, vhsubCode)

    # Bitwise AND of the two source elements.
    vandCode = '''
        destElem = srcElem1 & srcElem2;
    '''
    threeEqualRegInst("vand", "VandD", "SimdAluOp", unsignedTypes, 2, vandCode)
    threeEqualRegInst("vand", "VandQ", "SimdAluOp", unsignedTypes, 4, vandCode)

    # Bit clear: the first source ANDed with the complement of the second.
    vbicCode = '''
        destElem = srcElem1 & ~srcElem2;
    '''
    threeEqualRegInst("vbic", "VbicD", "SimdAluOp", unsignedTypes, 2, vbicCode)
    threeEqualRegInst("vbic", "VbicQ", "SimdAluOp", unsignedTypes, 4, vbicCode)

    # Bitwise OR of the two source elements.
    vorrCode = '''
        destElem = srcElem1 | srcElem2;
    '''
    threeEqualRegInst("vorr", "VorrD", "SimdAluOp", unsignedTypes, 2, vorrCode)
    threeEqualRegInst("vorr", "VorrQ", "SimdAluOp", unsignedTypes, 4, vorrCode)

    # The register-to-register vmov reuses the OR snippet, registered under a
    # different mnemonic and with the SimdMiscOp class.
    threeEqualRegInst("vmov", "VmovD", "SimdMiscOp", unsignedTypes, 2, vorrCode)
    threeEqualRegInst("vmov", "VmovQ", "SimdMiscOp", unsignedTypes, 4, vorrCode)

    # OR-NOT: the first source ORed with the complement of the second.
    vornCode = '''
        destElem = srcElem1 | ~srcElem2;
    '''
    threeEqualRegInst("vorn", "VornD", "SimdAluOp", unsignedTypes, 2, vornCode)
    threeEqualRegInst("vorn", "VornQ", "SimdAluOp", unsignedTypes, 4, vornCode)

    # Bitwise exclusive OR of the two source elements.
    veorCode = '''
        destElem = srcElem1 ^ srcElem2;
    '''
    threeEqualRegInst("veor", "VeorD", "SimdAluOp", unsignedTypes, 2, veorCode)
    threeEqualRegInst("veor", "VeorQ", "SimdAluOp", unsignedTypes, 4, veorCode)

    # The three bit-select forms below all use the old destination value, so
    # they pass True as the trailing argument (presumably readDest -- TODO
    # confirm against threeEqualRegInst).
    # vbif: keep destination bits where srcElem2 is 1, take srcElem1 bits
    # where srcElem2 is 0.
    vbifCode = '''
        destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);
    '''
    threeEqualRegInst("vbif", "VbifD", "SimdAluOp", unsignedTypes, 2, vbifCode, True)
    threeEqualRegInst("vbif", "VbifQ", "SimdAluOp", unsignedTypes, 4, vbifCode, True)
    # vbit: take srcElem1 bits where srcElem2 is 1, keep destination bits
    # where srcElem2 is 0.
    vbitCode = '''
        destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);
    '''
    threeEqualRegInst("vbit", "VbitD", "SimdAluOp", unsignedTypes, 2, vbitCode, True)
    threeEqualRegInst("vbit", "VbitQ", "SimdAluOp", unsignedTypes, 4, vbitCode, True)
    # vbsl: the old destination is the mask; take srcElem1 bits where it is
    # 1 and srcElem2 bits where it is 0.
    vbslCode = '''
        destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);
    '''
    threeEqualRegInst("vbsl", "VbslD", "SimdAluOp", unsignedTypes, 2, vbslCode, True)
    threeEqualRegInst("vbsl", "VbslQ", "SimdAluOp", unsignedTypes, 4, vbslCode, True)

    # Element-wise maximum; srcElem2 is chosen when the elements are equal.
    vmaxCode = '''
        destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;
    '''
    threeEqualRegInst("vmax", "VmaxD", "SimdCmpOp", allTypes, 2, vmaxCode)
    threeEqualRegInst("vmax", "VmaxQ", "SimdCmpOp", allTypes, 4, vmaxCode)

    # Element-wise minimum; srcElem2 is chosen when the elements are equal.
    vminCode = '''
        destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;
    '''
    threeEqualRegInst("vmin", "VminD", "SimdCmpOp", allTypes, 2, vminCode)
    threeEqualRegInst("vmin", "VminQ", "SimdCmpOp", allTypes, 4, vminCode)

    # Plain element-wise add. The same snippet also backs the pairwise add
    # (vpadd), which is registered with pairwise=True.
    vaddCode = '''
        destElem = srcElem1 + srcElem2;
    '''
    threeEqualRegInst("vadd", "NVaddD", "SimdAddOp", unsignedTypes, 2, vaddCode)
    threeEqualRegInst("vadd", "NVaddQ", "SimdAddOp", unsignedTypes, 4, vaddCode)

    threeEqualRegInst("vpadd", "NVpaddD", "SimdAddOp", smallUnsignedTypes,
                      2, vaddCode, pairwise=True)
    # Add with both operands cast to BigElement first; shared by the long
    # (vaddl) and wide (vaddw) forms.
    vaddlwCode = '''
        destElem = (BigElement)srcElem1 + (BigElement)srcElem2;
    '''
    threeRegLongInst("vaddl", "Vaddl", "SimdAddOp", smallTypes, vaddlwCode)
    threeRegWideInst("vaddw", "Vaddw", "SimdAddOp", smallTypes, vaddlwCode)
    # Add in BigElement precision, then shift right by the bit width of
    # Element so only the upper part of the sum is kept.
    vaddhnCode = '''
        destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
                   (sizeof(Element) * 8);
    '''
    threeRegNarrowInst("vaddhn", "Vaddhn", "SimdAddOp", smallTypes, vaddhnCode)
    # As vaddhnCode, but 1 << (bit width of Element - 1) is added before the
    # shift so the result is rounded.
    vraddhnCode = '''
        destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
                    ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                   (sizeof(Element) * 8);
    '''
    threeRegNarrowInst("vraddhn", "Vraddhn", "SimdAddOp", smallTypes, vraddhnCode)

    # Plain element-wise subtract.
    vsubCode = '''
        destElem = srcElem1 - srcElem2;
    '''
    threeEqualRegInst("vsub", "NVsubD", "SimdAddOp", unsignedTypes, 2, vsubCode)
    threeEqualRegInst("vsub", "NVsubQ", "SimdAddOp", unsignedTypes, 4, vsubCode)
    # Subtract with both operands cast to BigElement first; shared by the
    # long (vsubl) and wide (vsubw) forms.
    vsublwCode = '''
        destElem = (BigElement)srcElem1 - (BigElement)srcElem2;
    '''
    threeRegLongInst("vsubl", "Vsubl", "SimdAddOp", smallTypes, vsublwCode)
    threeRegWideInst("vsubw", "Vsubw", "SimdAddOp", smallTypes, vsublwCode)

    # Unsigned saturating add. A sum smaller than either operand means the
    # addition wrapped: the result is set to all ones and the qc bit of the
    # FPSCR value read from FpscrQc is set.
    vqaddUCode = '''
        destElem = srcElem1 + srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (destElem < srcElem1 || destElem < srcElem2) {
            destElem = (Element)(-1);
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqadd", "VqaddUD", "SimdAddOp", unsignedTypes, 2, vqaddUCode)
    threeEqualRegInst("vqadd", "VqaddUQ", "SimdAddOp", unsignedTypes, 4, vqaddUCode)
    # Subtract in BigElement precision, then shift right by the bit width of
    # Element so only the upper part of the difference is kept.
    vsubhnCode = '''
        destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
                   (sizeof(Element) * 8);
    '''
    threeRegNarrowInst("vsubhn", "Vsubhn", "SimdAddOp", smallTypes, vsubhnCode)
    # As vsubhnCode, but 1 << (bit width of Element - 1) is added before the
    # shift so the result is rounded.
    vrsubhnCode = '''
        destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
                    ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                   (sizeof(Element) * 8);
    '''
    threeRegNarrowInst("vrsubhn", "Vrsubhn", "SimdAddOp", smallTypes, vrsubhnCode)

    # Complex add. Elements are processed in pairs (2*i, 2*i+1). Bit 24 of
    # the instruction selects which element of the second operand's pair is
    # negated with fplibNeg before the pair is swapped and added to the first
    # operand with fplibAdd. The snippet contains its own loop and stores
    # into destReg directly; it uses an `fpscr` variable that it does not
    # declare (presumably supplied via standardFpcsr=True -- TODO confirm).
    vcaddCode = '''
        bool rot = bits(machInst, 24);
        Element el1;
        Element el3;

        for (int i = 0; i < eCount/2; ++i) {
            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
            Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
            Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
            Element destElem_1;
            Element destElem_2;
            if (rot) {
                el1 = srcElem2_2;
                el3 = fplibNeg<Element>(srcElem2_1);
            } else {
                el1 = fplibNeg<Element>(srcElem2_2);
                el3 = srcElem2_1;
            }

            destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
            destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
            destReg.elements[2*i] = htole(destElem_1);
            destReg.elements[2*i+1] = htole(destElem_2);
         }
         '''

    # VCADD
    threeEqualRegInst("vcadd", "VcaddD", "SimdFloatAddOp",
                            ("uint16_t", "uint32_t"), 2, vcaddCode,
                            standardFpcsr=True, complex=True)
    threeEqualRegInst("vcadd", "VcaddQ", "SimdFloatAddOp",
                            ("uint16_t", "uint32_t"), 4,
                           vcaddCode, standardFpcsr=True, complex=True)

    # Complex multiply-accumulate template. It is a Python format string with
    # two placeholders: %(rot)s is the machInst bit range holding the
    # rotation, and %(index)s is the pair index used to read the second
    # operand. For each element pair, the rotation selects which elements of
    # the two operands (optionally negated with fplibNeg) are fed to two
    # fplibMulAdd calls that accumulate into the old destination pair.
    vcmlaCode = '''
        uint8_t rot = bits(machInst, %(rot)s);
        Element el1;
        Element el2;
        Element el3;
        Element el4;
        for (int i = 0; i < eCount/2; ++i) {

            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
            Element srcElem2_1 = letoh(srcReg2.elements[2*%(index)s]);
            Element srcElem2_2 = letoh(srcReg2.elements[2*%(index)s+1]);
            Element destElem_1 = letoh(destReg.elements[2*i]);
            Element destElem_2 = letoh(destReg.elements[2*i+1]);

            switch (rot) {
              case 0x0:
                {
                  el1 = srcElem2_1;
                  el2 = srcElem1_1;
                  el3 = srcElem2_2;
                  el4 = srcElem1_1;
                  break;
                }
              case 0x1:
                {
                  el1 = fplibNeg<Element>(srcElem2_2);
                  el2 = srcElem1_2;
                  el3 = srcElem2_1;
                  el4 = srcElem1_2;
                  break;
                }
              case 0x2:
                {
                  el1 = fplibNeg<Element>(srcElem2_1);
                  el2 = srcElem1_1;
                  el3 = fplibNeg<Element>(srcElem2_2);
                  el4 = srcElem1_1;
                  break;
                }
              case 0x3:
                {
                  el1 = srcElem2_2;
                  el2 = srcElem1_2;
                  el3 = fplibNeg<Element>(srcElem2_1);
                  el4 = srcElem1_2;
                  break;
                }
            }

            destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
            destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);

            destReg.elements[2*i] = htole(destElem_1);
            destReg.elements[2*i+1] = htole(destElem_2);
         }
         '''

    # VCMLA (by element): rotation in machInst bits 21:20; the second
    # operand's pair is always the one selected by imm.
    vcmla_imm = vcmlaCode % {'rot': '21, 20', 'index': 'imm'}
    threeEqualRegInst("vcmla", "VcmlaElemD", "SimdFloatMultAccOp",
                           ("uint16_t", "uint32_t"), 2, vcmla_imm,
                           readDest=True, byElem=True, standardFpcsr=True,
                           complex=True)
    threeEqualRegInst("vcmla", "VcmlaElemQ", "SimdFloatMultAccOp",
                           ("uint16_t", "uint32_t"), 4, vcmla_imm,
                           readDest=True, byElem=True, standardFpcsr=True,
                           complex=True)

    # VCMLA (vector): rotation in machInst bits 24:23; the second operand's
    # pair tracks the loop index i.
    vcmla_vec = vcmlaCode % {'rot': '24, 23', 'index': 'i'}
    threeEqualRegInst("vcmla", "VcmlaD", "SimdFloatMultAccOp",
                            ("uint16_t", "uint32_t"), 2, vcmla_vec,
                             readDest=True, standardFpcsr=True, complex=True)
    threeEqualRegInst("vcmla", "VcmlaQ", "SimdFloatMultAccOp",
                            ("uint16_t", "uint32_t"), 4, vcmla_vec,
                             readDest=True, standardFpcsr=True, complex=True)

    # Signed saturating add. Overflow is detected from the signs: both
    # operands share a sign that differs from the sign of the sum. The
    # result is then clamped to the Element maximum or minimum and the qc
    # bit of the FPSCR value read from FpscrQc is set.
    vqaddSCode = '''
        destElem = srcElem1 + srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        bool negDest = (destElem < 0);
        bool negSrc1 = (srcElem1 < 0);
        bool negSrc2 = (srcElem2 < 0);
        if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
            if (negDest)
                /* If (>=0) plus (>=0) yields (<0), saturate to +. */
                destElem = std::numeric_limits<Element>::max();
            else
                /* If (<0) plus (<0) yields (>=0), saturate to -. */
                destElem = std::numeric_limits<Element>::min();
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqadd", "VqaddSD", "SimdAddOp", signedTypes, 2, vqaddSCode)
    threeEqualRegInst("vqadd", "VqaddSQ", "SimdAddOp", signedTypes, 4, vqaddSCode)

    # Unsigned saturating subtract. A difference larger than the first
    # operand means the subtraction wrapped: the result is set to 0 and qc
    # is set.
    vqsubUCode = '''
        destElem = srcElem1 - srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (destElem > srcElem1) {
            destElem = 0;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqsub", "VqsubUD", "SimdAddOp", unsignedTypes, 2, vqsubUCode)
    threeEqualRegInst("vqsub", "VqsubUQ", "SimdAddOp", unsignedTypes, 4, vqsubUCode)

    # Signed saturating subtract. Overflow is detected when the operands
    # have opposite signs (negSrc1 == posSrc2) and the sign of the result
    # differs from the sign of the first operand; the result is clamped and
    # qc is set as in vqaddSCode.
    vqsubSCode = '''
        destElem = srcElem1 - srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        bool negDest = (destElem < 0);
        bool negSrc1 = (srcElem1 < 0);
        bool posSrc2 = (srcElem2 >= 0);
        if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
            if (negDest)
                /* If (>=0) minus (<0) yields (<0), saturate to +. */
                destElem = std::numeric_limits<Element>::max();
            else
                /* If (<0) minus (>=0) yields (>=0), saturate to -. */
                destElem = std::numeric_limits<Element>::min();
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqsub", "VqsubSD", "SimdAddOp", signedTypes, 2, vqsubSCode)
    threeEqualRegInst("vqsub", "VqsubSQ", "SimdAddOp", signedTypes, 4, vqsubSCode)

    # Element-wise comparisons. Each writes (Element)(-1) when the condition
    # holds and 0 otherwise.
    # Greater than.
    vcgtCode = '''
        destElem =  (srcElem1 > srcElem2) ? (Element)(-1) : 0;
    '''
    threeEqualRegInst("vcgt", "VcgtD", "SimdCmpOp", allTypes, 2, vcgtCode)
    threeEqualRegInst("vcgt", "VcgtQ", "SimdCmpOp", allTypes, 4, vcgtCode)

    # Greater than or equal.
    vcgeCode = '''
        destElem =  (srcElem1 >= srcElem2) ? (Element)(-1) : 0;
    '''
    threeEqualRegInst("vcge", "VcgeD", "SimdCmpOp", allTypes, 2, vcgeCode)
    threeEqualRegInst("vcge", "VcgeQ", "SimdCmpOp", allTypes, 4, vcgeCode)

    # Equal; registered for the unsigned types only.
    vceqCode = '''
        destElem =  (srcElem1 == srcElem2) ? (Element)(-1) : 0;
    '''
    threeEqualRegInst("vceq", "VceqD", "SimdCmpOp", unsignedTypes, 2, vceqCode)
    threeEqualRegInst("vceq", "VceqQ", "SimdCmpOp", unsignedTypes, 4, vceqCode)

    # Shift by a per-element register amount. The amount is the low byte of
    # srcElem2 read as a signed value: negative amounts shift right (with the
    # sign bits restored by hand when srcElem1 is negative), non-negative
    # amounts shift left. Shifts by the element width or more give 0 before
    # any sign fix-up is applied.
    vshlCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
            // Make sure the right shift sign extended when it should.
            if (ltz(srcElem1) && !ltz(destElem)) {
                destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                             1 - shiftAmt));
            }
        } else {
            if (shiftAmt >= sizeof(Element) * 8) {
                destElem = 0;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        }
    '''
    threeEqualRegInst("vshl", "VshlD", "SimdShiftOp", allTypes, 2, vshlCode)
    threeEqualRegInst("vshl", "VshlQ", "SimdShiftOp", allTypes, 4, vshlCode)

    # Rounding shift by a per-element register amount. Same structure as
    # vshlCode, but on a right shift the last bit shifted out (rBit) is added
    # to the result; for a negative source shifted by more than the element
    # width rBit is forced to 1. A zero shift amount copies srcElem1.
    vrshlCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            Element rBit = 0;
            if (shiftAmt <= sizeof(Element) * 8)
                rBit = bits(srcElem1, shiftAmt - 1);
            if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
                rBit = 1;
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
            // Make sure the right shift sign extended when it should.
            if (ltz(srcElem1) && !ltz(destElem)) {
                destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                             1 - shiftAmt));
            }
            destElem += rBit;
        } else if (shiftAmt > 0) {
            if (shiftAmt >= sizeof(Element) * 8) {
                destElem = 0;
            } else {
                destElem = srcElem1 << shiftAmt;
            }
        } else {
            destElem = srcElem1;
        }
    '''
    threeEqualRegInst("vrshl", "VrshlD", "SimdAluOp", allTypes, 2, vrshlCode)
    threeEqualRegInst("vrshl", "VrshlQ", "SimdAluOp", allTypes, 4, vrshlCode)

    # Unsigned saturating shift by a per-element register amount. Right
    # shifts (negative amounts) never saturate. A left shift saturates to all
    # ones and sets qc when any bit would be shifted out of the element,
    # i.e. when the top shiftAmt bits of srcElem1 are non-zero, or when
    # srcElem1 is non-zero and the amount is at least the element width.
    vqshlUCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
        } else if (shiftAmt > 0) {
            if (shiftAmt >= sizeof(Element) * 8) {
                if (srcElem1 != 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = 0;
                }
            } else {
                if (bits(srcElem1, sizeof(Element) * 8 - 1,
                            sizeof(Element) * 8 - shiftAmt)) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqshl", "VqshlUD", "SimdAluOp", unsignedTypes, 2, vqshlUCode)
    threeEqualRegInst("vqshl", "VqshlUQ", "SimdAluOp", unsignedTypes, 4, vqshlUCode)

    # Signed saturating shift by a per-element register amount. Right shifts
    # restore the sign bits by hand and never saturate. A left shift
    # saturates when the top shiftAmt + 1 bits of srcElem1 are not all equal
    # to its sign (or when srcElem1 is non-zero and the amount is at least
    # the element width); the result is then the largest positive value, or
    # its complement for a negative source, and qc is set.
    # NOTE(review): these are registered with SimdCmpOp while the VqshlU
    # variants use SimdAluOp -- confirm the op class is intended.
    vqshlSCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
            // Make sure the right shift sign extended when it should.
            if (srcElem1 < 0 && destElem >= 0) {
                destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                             1 - shiftAmt));
            }
        } else if (shiftAmt > 0) {
            bool sat = false;
            if (shiftAmt >= sizeof(Element) * 8) {
                if (srcElem1 != 0)
                    sat = true;
                else
                    destElem = 0;
            } else {
                if (bits(srcElem1, sizeof(Element) * 8 - 1,
                            sizeof(Element) * 8 - 1 - shiftAmt) !=
                        ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
                    sat = true;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            }
            if (sat) {
                fpscr.qc = 1;
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqshl", "VqshlSD", "SimdCmpOp", signedTypes, 2, vqshlSCode)
    threeEqualRegInst("vqshl", "VqshlSQ", "SimdCmpOp", signedTypes, 4, vqshlSCode)

    # Per-element C++ body for the unsigned saturating rounding shift
    # (vqrshl). The low byte of srcElem2 is the signed shift amount.
    #  - Negative: shift right by the magnitude and add rBit, the last bit
    #    shifted out (bit shiftAmt - 1 of srcElem1); rBit stays 0 when the
    #    magnitude exceeds the element width.
    #  - Zero: copy srcElem1.
    #  - Positive: shift left; if set bits would be lost the result becomes
    #    mask(sizeof(Element) * 8) and qc is set.
    vqrshlUCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            Element rBit = 0;
            if (shiftAmt <= sizeof(Element) * 8)
                rBit = bits(srcElem1, shiftAmt - 1);
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
            destElem += rBit;
        } else if (shiftAmt == 0) {
            destElem = srcElem1;
        } else {
            if (shiftAmt >= sizeof(Element) * 8) {
                if (srcElem1 != 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = 0;
                }
            } else {
                if (bits(srcElem1, sizeof(Element) * 8 - 1,
                            sizeof(Element) * 8 - shiftAmt)) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            }
        }
        FpscrQc = fpscr;
    '''
    # Instantiated over unsignedTypes with register counts 2 (D) and 4 (Q).
    threeEqualRegInst("vqrshl", "VqrshlUD", "SimdCmpOp", unsignedTypes, 2, vqrshlUCode)
    threeEqualRegInst("vqrshl", "VqrshlUQ", "SimdCmpOp", unsignedTypes, 4, vqrshlUCode)

    # Per-element C++ body for the signed saturating rounding shift (vqrshl).
    # The low byte of srcElem2 is the signed shift amount.
    #  - Negative: shift right by the magnitude, sign extend the result when
    #    srcElem1 is negative, then add the rounding bit. rBit is bit
    #    (shiftAmt - 1) of srcElem1 while the magnitude is within the element
    #    width, and is forced to 1 for negative sources beyond that, which
    #    turns the sign-extended all-ones value into 0.
    #  - Positive: shift left with the same saturation rule as the signed
    #    vqshl body: on overflow the result is mask(width - 1), inverted for
    #    negative sources, and qc is set.
    #  - Zero: copy srcElem1.
    vqrshlSCode = '''
        int16_t shiftAmt = (int8_t)srcElem2;
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (shiftAmt < 0) {
            shiftAmt = -shiftAmt;
            Element rBit = 0;
            if (shiftAmt <= sizeof(Element) * 8)
                rBit = bits(srcElem1, shiftAmt - 1);
            if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
                rBit = 1;
            if (shiftAmt >= sizeof(Element) * 8) {
                shiftAmt = sizeof(Element) * 8 - 1;
                destElem = 0;
            } else {
                destElem = (srcElem1 >> shiftAmt);
            }
            // Make sure the right shift sign extended when it should.
            if (srcElem1 < 0 && destElem >= 0) {
                destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                             1 - shiftAmt));
            }
            destElem += rBit;
        } else if (shiftAmt > 0) {
            bool sat = false;
            if (shiftAmt >= sizeof(Element) * 8) {
                if (srcElem1 != 0)
                    sat = true;
                else
                    destElem = 0;
            } else {
                if (bits(srcElem1, sizeof(Element) * 8 - 1,
                            sizeof(Element) * 8 - 1 - shiftAmt) !=
                        ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
                    sat = true;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            }
            if (sat) {
                fpscr.qc = 1;
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    # Instantiated over signedTypes with register counts 2 (D) and 4 (Q).
    threeEqualRegInst("vqrshl", "VqrshlSD", "SimdCmpOp", signedTypes, 2, vqrshlSCode)
    threeEqualRegInst("vqrshl", "VqrshlSQ", "SimdCmpOp", signedTypes, 4, vqrshlSCode)

    # vaba: add the absolute difference of the two sources to destElem. The
    # larger operand is always the minuend, so the subtraction never goes
    # negative.
    # NOTE(review): the trailing positional True is presumably readDest
    # (destElem is read before it is written) -- confirm against the
    # threeEqualRegInst / threeRegLongInst signatures.
    vabaCode = '''
        destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                            (srcElem2 - srcElem1);
    '''
    threeEqualRegInst("vaba", "VabaD", "SimdAddAccOp", allTypes, 2, vabaCode, True)
    threeEqualRegInst("vaba", "VabaQ", "SimdAddAccOp", allTypes, 4, vabaCode, True)
    # vabal: same accumulation, with both operands cast to BigElement before
    # subtracting.
    vabalCode = '''
        destElem += (srcElem1 > srcElem2) ?
            ((BigElement)srcElem1 - (BigElement)srcElem2) :
            ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
    threeRegLongInst("vabal", "Vabal", "SimdAddAccOp", smallTypes, vabalCode, True)

    # vabd: destElem is the absolute difference of the two sources; the
    # larger operand is always the minuend.
    vabdCode = '''
        destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                           (srcElem2 - srcElem1);
    '''
    threeEqualRegInst("vabd", "VabdD", "SimdAddOp", allTypes, 2, vabdCode)
    threeEqualRegInst("vabd", "VabdQ", "SimdAddOp", allTypes, 4, vabdCode)
    # vabdl: same computation with both operands cast to BigElement before
    # subtracting.
    vabdlCode = '''
        destElem = (srcElem1 > srcElem2) ?
            ((BigElement)srcElem1 - (BigElement)srcElem2) :
            ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
    threeRegLongInst("vabdl", "Vabdl", "SimdAddOp", smallTypes, vabdlCode)

    # vtst: destElem is all ones ((Element)(-1)) when the two sources share
    # at least one set bit, and 0 otherwise.
    vtstCode = '''
        destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;
    '''
    threeEqualRegInst("vtst", "VtstD", "SimdAluOp", unsignedTypes, 2, vtstCode)
    threeEqualRegInst("vtst", "VtstQ", "SimdAluOp", unsignedTypes, 4, vtstCode)

    # vmul: element-wise product computed in the Element type.
    vmulCode = '''
        destElem = srcElem1 * srcElem2;
    '''
    threeEqualRegInst("vmul", "NVmulD", "SimdMultOp", allTypes, 2, vmulCode)
    threeEqualRegInst("vmul", "NVmulQ", "SimdMultOp", allTypes, 4, vmulCode)
    # vmull: both operands are cast to BigElement before multiplying.
    vmullCode = '''
        destElem = (BigElement)srcElem1 * (BigElement)srcElem2;
    '''
    threeRegLongInst("vmull", "Vmull", "SimdMultOp", smallTypes, vmullCode)

    # vmla: multiply-accumulate, destElem += srcElem1 * srcElem2.
    # NOTE(review): the trailing positional True is presumably readDest,
    # since destElem is read on the right-hand side -- confirm against the
    # generator helper signatures.
    vmlaCode = '''
        destElem = destElem + srcElem1 * srcElem2;
    '''
    threeEqualRegInst("vmla", "NVmlaD", "SimdMultAccOp", allTypes, 2, vmlaCode, True)
    threeEqualRegInst("vmla", "NVmlaQ", "SimdMultAccOp", allTypes, 4, vmlaCode, True)
    # vmlal: same accumulation with the operands cast to BigElement before
    # multiplying.
    vmlalCode = '''
        destElem = destElem + (BigElement)srcElem1 * (BigElement)srcElem2;
    '''
    threeRegLongInst("vmlal", "Vmlal", "SimdMultAccOp", smallTypes, vmlalCode, True)

    # vqdmlal: saturating doubling multiply-accumulate long.
    #   midElem  = sat(2 * srcElem1 * srcElem2)   (in BigElement)
    #   destElem = sat(destElem + midElem)
    # qc is set whenever either step saturates and is written back through
    # FpscrQc.
    #
    # The doubled product exceeds the BigElement range only when both
    # operands are the most negative Element value, which is the same
    # condition vqdmullCode tests. Products involving maxNeg / 2 are
    # 2^(2n-2) for n-bit elements and fit in 2n signed bits, so they must
    # not be saturated. The product is only evaluated on the non-saturating
    # path so the overflowing multiplication is never performed.
    vqdmlalCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        Element maxNeg = std::numeric_limits<Element>::min();
        BigElement midElem;
        if (srcElem1 == maxNeg && srcElem2 == maxNeg) {
            // Only operand pair whose doubled product does not fit.
            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
            fpscr.qc = 1;
        } else {
            midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        }
        bool negPreDest = ltz(destElem);
        destElem += midElem;
        bool negDest = ltz(destElem);
        bool negMid = ltz(midElem);
        // Adding two values of the same sign overflowed if the sign flipped.
        if (negPreDest == negMid && negMid != negDest) {
            destElem = mask(sizeof(BigElement) * 8 - 1);
            if (negPreDest)
                destElem = ~destElem;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInst("vqdmlal", "Vqdmlal", "SimdMultAccOp", smallTypes, vqdmlalCode, True)

    # vqdmlsl: saturating doubling multiply-subtract long.
    #   midElem  = sat(2 * srcElem1 * srcElem2)   (in BigElement)
    #   destElem = sat(destElem - midElem)
    # qc is set whenever either step saturates and is written back through
    # FpscrQc.
    #
    # The doubled product exceeds the BigElement range only when both
    # operands are the most negative Element value, which is the same
    # condition vqdmullCode tests. Products involving maxNeg / 2 are
    # 2^(2n-2) for n-bit elements and fit in 2n signed bits, so they must
    # not be saturated. The product is only evaluated on the non-saturating
    # path so the overflowing multiplication is never performed.
    vqdmlslCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        Element maxNeg = std::numeric_limits<Element>::min();
        BigElement midElem;
        if (srcElem1 == maxNeg && srcElem2 == maxNeg) {
            // Only operand pair whose doubled product does not fit.
            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
            fpscr.qc = 1;
        } else {
            midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        }
        bool negPreDest = ltz(destElem);
        destElem -= midElem;
        bool negDest = ltz(destElem);
        // posMid is the sign of the negated product, i.e. of the value
        // that is effectively added to destElem.
        bool posMid = ltz((BigElement)-midElem);
        if (negPreDest == posMid && posMid != negDest) {
            destElem = mask(sizeof(BigElement) * 8 - 1);
            if (negPreDest)
                destElem = ~destElem;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInst("vqdmlsl", "Vqdmlsl", "SimdMultAccOp", smallTypes, vqdmlslCode, True)

    # vqdmull: destElem is twice the product of the two sources. When both
    # sources equal the most negative Element value the result is replaced
    # by ~((BigElement)srcElem1 << element width) and qc is set; the updated
    # fpscr is written back through FpscrQc.
    vqdmullCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        if (srcElem1 == srcElem2 &&
                srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInst("vqdmull", "Vqdmull", "SimdMultAccOp", smallTypes, vqdmullCode)

    # vmls: multiply-subtract, destElem -= srcElem1 * srcElem2.
    # NOTE(review): the trailing positional True is presumably readDest,
    # since destElem is read on the right-hand side -- confirm against the
    # generator helper signatures.
    vmlsCode = '''
        destElem = destElem - srcElem1 * srcElem2;
    '''
    threeEqualRegInst("vmls", "NVmlsD", "SimdMultAccOp", allTypes, 2, vmlsCode, True)
    threeEqualRegInst("vmls", "NVmlsQ", "SimdMultAccOp", allTypes, 4, vmlsCode, True)
    # vmlsl: same operation with the operands cast to BigElement before
    # multiplying.
    vmlslCode = '''
        destElem = destElem - (BigElement)srcElem1 * (BigElement)srcElem2;
    '''
    threeRegLongInst("vmlsl", "Vmlsl", "SimdMultAccOp", smallTypes, vmlslCode, True)

    # Shift-and-XOR (carry-less) multiply: for every set bit j of srcElem2,
    # srcElem1 shifted left by j is XORed into destElem. The result is kept
    # in the Element type.
    vmulpCode = '''
        destElem = 0;
        for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
            if (bits(srcElem2, j))
                destElem ^= srcElem1 << j;
        }
    '''
    threeEqualRegInst("vmul", "NVmulpD", "SimdMultOp", unsignedTypes, 2, vmulpCode)
    threeEqualRegInst("vmul", "NVmulpQ", "SimdMultOp", unsignedTypes, 4, vmulpCode)
    # Long form of the same multiply: srcElem1 is cast to BigElement before
    # shifting, so the shifted terms are formed in the wider type.
    vmullpCode = '''
        destElem = 0;
        for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
            if (bits(srcElem2, j))
                destElem ^= (BigElement)srcElem1 << j;
        }
    '''
    threeRegLongInst("vmull", "Vmullp", "SimdMultOp", smallUnsignedTypes, vmullpCode)

    # Pairwise integer max/min. These reuse vmaxCode and vminCode, which are
    # defined elsewhere in this file, with pairwise=True; only a
    # register-count-2 (D) form is instantiated here.
    threeEqualRegInst("vpmax", "VpmaxD", "SimdCmpOp", smallTypes, 2, vmaxCode, pairwise=True)

    threeEqualRegInst("vpmin", "VpminD", "SimdCmpOp", smallTypes, 2, vminCode, pairwise=True)

    # vqdmulh: destElem is the doubled product of the sources shifted right
    # by the element width (i.e. its upper half). When both sources equal
    # the most negative Element value the result is replaced by ~srcElem1
    # and qc is set; the updated fpscr is written back through FpscrQc.
    vqdmulhCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
                   (sizeof(Element) * 8);
        if (srcElem1 == srcElem2 &&
                srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            destElem = ~srcElem1;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqdmulh", "VqdmulhD", "SimdMultOp", smallSignedTypes, 2, vqdmulhCode)
    threeEqualRegInst("vqdmulh", "VqdmulhQ", "SimdMultOp", smallSignedTypes, 4, vqdmulhCode)


    # Shared C++ template for vqrdmlah / vqrdmlsh. %(code)s is replaced by
    # "+" or "-" below. The body computes
    #   ((destElem << nbits) +/- 2 * srcElem1 * srcElem2 + (1 << (nbits - 1)))
    #       >> nbits
    # in BigElement, then clamps the value to the Element range, setting qc
    # whenever clamping happens.
    vqrdmCode = '''
          FPSCR fpscr = (FPSCR) FpscrQc;
          int nbits = sizeof(Element)*8;

          auto val_max = std::numeric_limits<Element>::max();
          auto val_min = std::numeric_limits<Element>::min();
          BigElement unsat_value = ((BigElement)destElem << nbits) %(code)s
                ((BigElement)srcElem1 * (BigElement)srcElem2 * 2) +
                ((BigElement)1 << (nbits - 1));
          unsat_value >>= nbits;

          if (unsat_value > val_max) {
              fpscr.qc = 1;
              destElem = val_max;
          } else if (unsat_value < val_min) {
              fpscr.qc = 1;
              destElem = val_min;
          } else {
              destElem = unsat_value;
          }
          FpscrQc = fpscr;
    '''
    code_add = "+"
    vqrdmlahCode = vqrdmCode % {'code': code_add}
    # Extra C++ passed through extra=: returns UndefinedInstruction unless
    # ID_ISAR5 bits [27:24] equal 1, and also when the size field
    # (machInst bits [21:20]) is 0 or 3. It also typedefs BigElement as
    # __int128_t for the body above.
    rdm_check = '''
      int sz = bits(machInst, 21, 20);
      RegVal isar5 = xc->tcBase()->readMiscReg(MISCREG_ID_ISAR5);
      if (!(bits(isar5, 27, 24) == 0x1) || sz == 3 || sz == 0)
          return std::make_shared<UndefinedInstruction>(machInst, true);
      typedef __int128_t BigElement;
    '''
    threeEqualRegInst("vqrdmlah", "VqrdmlahD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmlahCode, readDest=True,
            extra=rdm_check)
    threeEqualRegInst("vqrdmlah", "VqrdmlahQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmlahCode, readDest=True,
            extra=rdm_check)

    # Subtracting variant: same template with "-" substituted.
    code_sub = "-"
    vqrdmlshCode = vqrdmCode % {'code': code_sub}
    threeEqualRegInst("vqrdmlsh", "VqrdmlshD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmlshCode, readDest=True,
            extra=rdm_check)
    threeEqualRegInst("vqrdmlsh", "VqrdmlshQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmlshCode, readDest=True,
            extra=rdm_check)


    # vqrdmulh: saturating rounding doubling multiply returning the high
    # half, i.e.
    #   destElem = sat((2 * srcElem1 * srcElem2 + (1 << (n - 1))) >> n)
    # for n-bit elements. qc is set on saturation and written back through
    # FpscrQc.
    #
    # The rounded result leaves the Element range only when both operands
    # are the most negative Element value (the exact result would be
    # 2^(n-1)); it then saturates to the largest positive value. Operand
    # pairs involving maxNeg / 2 yield 2^(n-2), which is representable, so
    # they take the normal path and leave qc untouched. Handling the
    # saturating pair first also avoids evaluating 2 * maxNeg * maxNeg,
    # which does not fit in int64_t for 32-bit elements.
    vqrdmulhCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        Element maxNeg = std::numeric_limits<Element>::min();
        if (srcElem1 == maxNeg && srcElem2 == maxNeg) {
            // Only operand pair whose rounded high half does not fit.
            destElem = mask(sizeof(Element) * 8 - 1);
            fpscr.qc = 1;
        } else {
            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
        }
        FpscrQc = fpscr;
    '''
    threeEqualRegInst("vqrdmulh", "VqrdmulhD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmulhCode)
    threeEqualRegInst("vqrdmulh", "VqrdmulhQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmulhCode)

    # Floating-point min/max family. Every instruction shares one C++ body
    # that calls an fplib function whose name suffix (Max, MaxNum, Min,
    # MinNum) is substituted per instruction.
    vMinMaxFpCode = '''
        destElem = fplib%s<Element>(srcElem1, srcElem2, fpscr);
    '''
    # Table rows are (mnemonic, class name, register count, fplib suffix,
    # pairwise). Each mnemonic contributes a D row (2 registers) followed by
    # a Q row (4 registers); class names are "V" + mnemonic without its
    # leading "v" + D/Q + "Fp".
    vMinMaxInsts = [
        (mnem, "V%s%sFp" % (mnem[1:], regSuffix), regCount, fplibOp, isPairwise)
        for mnem, fplibOp, isPairwise in (
            ("vmax",   "Max",    False),
            ("vmaxnm", "MaxNum", False),
            ("vpmax",  "Max",    True),
            ("vmin",   "Min",    False),
            ("vminnm", "MinNum", False),
            ("vpmin",  "Min",    True),
        )
        for regSuffix, regCount in (("D", 2), ("Q", 4))
    ]
    for name, Name, rCount, op, pairwise in vMinMaxInsts:
        threeEqualRegInst(name, Name, "SimdFloatCmpOp",
                          ("uint16_t", "uint32_t",), rCount,
                          vMinMaxFpCode % op,
                          pairwise=pairwise, standardFpcsr=True)

    # Single-precision vadd: runs binaryOp with fpAddS on the two source
    # registers, using a local copy of FpscrExc that is written back
    # afterwards.
    vaddfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = binaryOp(fpscr, srcReg1, srcReg2, fpAddS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vadd", "VaddDFp", "SimdFloatAddOp", ("float",), 2, vaddfpCode)
    threeEqualRegInstFp("vadd", "VaddQFp", "SimdFloatAddOp", ("float",), 4, vaddfpCode)

    # vpadd reuses the vadd body with pairwise=True.
    threeEqualRegInstFp("vpadd", "VpaddDFp", "SimdFloatAddOp", ("float",),
                        2, vaddfpCode, pairwise=True)
    threeEqualRegInstFp("vpadd", "VpaddQFp", "SimdFloatAddOp", ("float",),
                        4, vaddfpCode, pairwise=True)

    # uint16_t-element (FpH) vadd: calls fplibAdd on an FPSCR value obtained
    # from fpVASimdFPSCRValue, then folds the result back into FpscrExc with
    # fpRestoreFPSCRValue.
    vaddfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibAdd(srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vadd", "VaddDFpH", "SimdFloatAddOp", ("uint16_t",),
            2, vaddfpHCode)
    threeEqualRegInst("vadd", "VaddQFpH", "SimdFloatAddOp", ("uint16_t",),
            4, vaddfpHCode)

    # FpH vpadd reuses the FpH vadd body with pairwise=True.
    threeEqualRegInst("vpadd", "VpaddDFpH", "SimdFloatAddOp", ("uint16_t",),
                        2, vaddfpHCode, pairwise=True)
    threeEqualRegInst("vpadd", "VpaddQFpH", "SimdFloatAddOp", ("uint16_t",),
                        4, vaddfpHCode, pairwise=True)

    # Single-precision vsub: runs binaryOp with fpSubS on the two source
    # registers, using a local copy of FpscrExc that is written back
    # afterwards.
    vsubfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = binaryOp(fpscr, srcReg1, srcReg2, fpSubS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vsub", "VsubDFp", "SimdFloatAddOp", ("float",), 2, vsubfpCode)
    threeEqualRegInstFp("vsub", "VsubQFp", "SimdFloatAddOp", ("float",), 4, vsubfpCode)

    # uint16_t-element (FpH) vsub: calls fplibSub on an FPSCR value obtained
    # from fpVASimdFPSCRValue, then folds the result back into FpscrExc with
    # fpRestoreFPSCRValue.
    vsubfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibSub(srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vsub", "VsubDFpH", "SimdFloatAddOp", ("uint16_t",),
            2, vsubfpHCode)
    threeEqualRegInst("vsub", "VsubQFpH", "SimdFloatAddOp", ("uint16_t",),
            4, vsubfpHCode)

    # Single-precision vmul: runs binaryOp with fpMulS on the two source
    # registers, using a local copy of FpscrExc that is written back
    # afterwards.
    vmulfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = binaryOp(fpscr, srcReg1, srcReg2, fpMulS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vmul", "NVmulDFp", "SimdFloatMultOp", ("float",), 2, vmulfpCode)
    threeEqualRegInstFp("vmul", "NVmulQFp", "SimdFloatMultOp", ("float",), 4, vmulfpCode)

    # uint16_t-element (FpH) vmul: calls fplibMul on an FPSCR value obtained
    # from fpVASimdFPSCRValue, then folds the result back into FpscrExc with
    # fpRestoreFPSCRValue.
    vmulfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibMul(srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vmul", "NVmulDFpH", "SimdFloatMultOp", ("uint16_t",),
            2, vmulfpHCode)
    threeEqualRegInst("vmul", "NVmulQFpH", "SimdFloatMultOp", ("uint16_t",),
            4, vmulfpHCode)

    # Single-precision vmla, done as two separate operations: a multiply
    # (fpMulS) into the temporary "mid", then an add (fpAddS) of mid and the
    # current destReg. Both steps share the same local fpscr copy, which is
    # written back to FpscrExc at the end.
    vmlafpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS,
                             true, true, VfpRoundNearest);
        destReg = binaryOp(fpscr, mid, destReg, fpAddS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vmla", "NVmlaDFp", "SimdFloatMultAccOp", ("float",), 2, vmlafpCode, True)
    threeEqualRegInstFp("vmla", "NVmlaQFp", "SimdFloatMultAccOp", ("float",), 4, vmlafpCode, True)

    # uint16_t-element (FpH) vmla: fplibMul followed by fplibAdd with the
    # existing destElem, again as two separate fplib calls.
    vmlafpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibAdd(
            destElem, fplibMul(srcElem1, srcElem2, fpscr), fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vmla", "NVmlaDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      2, vmlafpHCode, True)
    threeEqualRegInst("vmla", "NVmlaQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      4, vmlafpHCode, True)

    # Single-precision vfma: a single ternaryOp call with fpMulAdd<float>
    # over srcReg1, srcReg2 and the current destReg (contrast with vmla,
    # which issues a separate multiply and add).
    vfmafpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = ternaryOp(fpscr, srcReg1, srcReg2, destReg, fpMulAdd<float>,
                            true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vfma", "NVfmaDFp", "SimdFloatMultAccOp", ("float",), 2, vfmafpCode, True)
    threeEqualRegInstFp("vfma", "NVfmaQFp", "SimdFloatMultAccOp", ("float",), 4, vfmafpCode, True)

    # uint16_t-element (FpH) vfma: one fplibMulAdd call taking the
    # accumulator (destElem) first, then the two multiplicands.
    vfmafpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibMulAdd(destElem, srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vfma", "NVfmaDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      2, vfmafpHCode, True)
    threeEqualRegInst("vfma", "NVfmaQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      4, vfmafpHCode, True)

    # Extra C++ check shared by vfmal / vfmsl: returns UndefinedInstruction
    # unless ID_ISAR6 bits [11:8] equal 1.
    fhm_check = '''
      RegVal isar6 = xc->tcBase()->readMiscReg(MISCREG_ID_ISAR6);
      if (!(bits(isar6, 11, 8) == 0x1))
          return std::make_shared<UndefinedInstruction>(machInst, true);
    '''
    # vfmal: fplibMulAddH with destElem as the accumulator and uint16_t
    # source elements. Instantiated through both threeRegLongInst and
    # twoRegLongInst (the "Elem" classes); the D forms pass short=True.
    # NOTE(review): the positional True is presumably readDest -- confirm
    # against the threeRegLongInst / twoRegLongInst signatures.
    vfmalfpCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibMulAddH(destElem, srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeRegLongInst("vfmal", "NVfmalD", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmalfpCode, True, short=True, extra_check=fhm_check)
    threeRegLongInst("vfmal", "NVfmalQ", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmalfpCode, True, extra_check=fhm_check)
    twoRegLongInst("vfmal", "NVfmalElemD", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmalfpCode, True, short=True, extra_check=fhm_check)
    twoRegLongInst("vfmal", "NVfmalElemQ", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmalfpCode, True, extra_check=fhm_check)

    # Single-precision vfms: same ternaryOp / fpMulAdd<float> call as vfma,
    # but with srcReg1 negated before the operation.
    vfmsfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = ternaryOp(fpscr, -srcReg1, srcReg2, destReg, fpMulAdd<float>,
                            true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vfms", "NVfmsDFp", "SimdFloatMultAccOp", ("float",), 2, vfmsfpCode, True)
    threeEqualRegInstFp("vfms", "NVfmsQFp", "SimdFloatMultAccOp", ("float",), 4, vfmsfpCode, True)

    # uint16_t-element (FpH) vfms: fplibMulAdd with fplibNeg applied to
    # srcElem1.
    vfmsfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibMulAdd(destElem, fplibNeg(srcElem1), srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vfms", "NVfmsDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      2, vfmsfpHCode, True)
    threeEqualRegInst("vfms", "NVfmsQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      4, vfmsfpHCode, True)

    # vfmsl: fplibMulAddH with fplibNeg applied to srcElem1 and destElem as
    # the accumulator. Instantiated with the same four forms and the same
    # fhm_check guard as vfmal; the D forms pass short=True.
    vfmslfpCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibMulAddH(destElem, fplibNeg(srcElem1), srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeRegLongInst("vfmsl", "NVfmslD", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmslfpCode, True, short=True, extra_check=fhm_check)
    threeRegLongInst("vfmsl", "NVfmslQ", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmslfpCode, True, extra_check=fhm_check)
    twoRegLongInst("vfmsl", "NVfmslElemD", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmslfpCode, True, short=True, extra_check=fhm_check)
    twoRegLongInst("vfmsl", "NVfmslElemQ", "SimdFloatMultAccOp", ("uint16_t",),
                     vfmslfpCode, True, extra_check=fhm_check)

    # Extra C++ check shared by the vfmab / vfmat forms: returns
    # UndefinedInstruction unless ID_ISAR6 bits [23:20] are at least 1.
    bf16_check = '''
      RegVal isar6 = xc->tcBase()->readMiscReg(MISCREG_ID_ISAR6);
      if (!(bits(isar6, 23, 20) >= 0x1))
          return std::make_shared<UndefinedInstruction>(machInst, true);
    '''
    # By-element vfmab / vfmat body. %(sel)s picks which 16-bit half of each
    # 32-bit srcElem1 is used (0 for vfmab, 1 for vfmat), while the half of
    # srcElem2 is chosen by machInst bit 3. Each selected 16-bit value is
    # moved into the upper half of a 32-bit word before being passed to
    # fplibMulAdd together with the 32-bit accumulator destElem.
    vfmabtElemBfCode = '''
        int sel = %(sel)s;
        int immsel = bits(machInst, 3);
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);

        uint32_t src1_elem = (uint16_t)(srcElem1 >> (16 * sel));
        uint32_t src2_elem = (uint16_t)(srcElem2 >> (16 * immsel));
        destElem = fplibMulAdd(destElem,
            (uint32_t)src1_elem << 16, (uint32_t)src2_elem << 16, fpscr);

        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoEqualRegInst("vfmab", "VfmabElemQ", "SimdBf16MultAccOp",
        ("uint32_t",), 4, vfmabtElemBfCode % {"sel": "0"},
        True, extra=bf16_check)
    twoEqualRegInst("vfmat", "VfmatElemQ", "SimdBf16MultAccOp",
        ("uint32_t",), 4, vfmabtElemBfCode % {"sel": "1"},
        True, extra=bf16_check)

    # Vector vfmab / vfmat body. %(sel)s picks the same 16-bit half of both
    # 32-bit source elements (0 for vfmab, 1 for vfmat). Each selected
    # 16-bit value is moved into the upper half of a 32-bit word before
    # being passed to fplibMulAdd together with the 32-bit accumulator
    # destElem. Guarded by bf16_check.
    vfmabtBfCode = '''
        int sel = %(sel)s;
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);

        uint32_t src1_elem = (uint16_t)(srcElem1 >> (16 * sel));
        uint32_t src2_elem = (uint16_t)(srcElem2 >> (16 * sel));
        destElem = fplibMulAdd(destElem,
            (uint32_t)src1_elem << 16, (uint32_t)src2_elem << 16, fpscr);

        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vfmab", "VfmabQ", "SimdBf16MultAccOp", ("uint32_t",), 4,
        vfmabtBfCode % {"sel": "0"}, True, extra=bf16_check)
    threeEqualRegInst("vfmat", "VfmatQ", "SimdBf16MultAccOp", ("uint32_t",), 4,
        vfmabtBfCode % {"sel": "1"}, True, extra=bf16_check)

    # Single-precision vmls, done as two separate operations: a multiply
    # (fpMulS) into the temporary "mid", then a subtract (fpSubS) of mid
    # from the current destReg.
    vmlsfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float mid = binaryOp(fpscr, srcReg1, srcReg2, fpMulS,
                             true, true, VfpRoundNearest);
        destReg = binaryOp(fpscr, destReg, mid, fpSubS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vmls", "NVmlsDFp", "SimdFloatMultAccOp", ("float",), 2, vmlsfpCode, True)
    threeEqualRegInstFp("vmls", "NVmlsQFp", "SimdFloatMultAccOp", ("float",), 4, vmlsfpCode, True)

    # uint16_t-element (FpH) vmls: the fplibMul result is negated with
    # fplibNeg and then added to destElem with fplibAdd.
    vmlsfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibAdd(
            destElem, fplibNeg(fplibMul(srcElem1, srcElem2, fpscr)), fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vmls", "NVmlsDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      2, vmlsfpHCode, True)
    threeEqualRegInst("vmls", "NVmlsQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                      4, vmlsfpHCode, True)

    # Single-precision vcgt. The float returned by binaryOp(..., vcgtFunc,
    # ...) is treated as a status code: 0 produces an all-ones destReg (-1),
    # any other value produces 0, and 2.0 additionally sets fpscr.ioc.
    # Instantiated with toInt = True.
    vcgtfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, srcReg2, vcgtFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vcgt", "VcgtDFp", "SimdFloatCmpOp", ("float",),
            2, vcgtfpCode, toInt = True)
    threeEqualRegInstFp("vcgt", "VcgtQFp", "SimdFloatCmpOp", ("float",),
            4, vcgtfpCode, toInt = True)

    # uint16_t-element (FpH) vcgt: destElem is 0xFFFF when fplibCompareGT
    # reports true and 0x0000 otherwise.
    vcgtfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGT(srcElem1, srcElem2, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vcgt", "VcgtDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vcgtfpHCode)
    threeEqualRegInst("vcgt", "VcgtQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vcgtfpHCode)

    # Single-precision vcge. The float returned by binaryOp(..., vcgeFunc,
    # ...) is treated as a status code: 0 produces an all-ones destReg (-1),
    # any other value produces 0, and 2.0 additionally sets fpscr.ioc.
    # Instantiated with toInt = True.
    vcgefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, srcReg2, vcgeFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vcge", "VcgeDFp", "SimdFloatCmpOp", ("float",),
            2, vcgefpCode, toInt = True)
    threeEqualRegInstFp("vcge", "VcgeQFp", "SimdFloatCmpOp", ("float",),
            4, vcgefpCode, toInt = True)

    # uint16_t-element (FpH) vcge: destElem is 0xFFFF when fplibCompareGE
    # reports true and 0x0000 otherwise.
    vcgefpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGE(srcElem1, srcElem2, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vcge", "VcgeDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vcgefpHCode)
    threeEqualRegInst("vcge", "VcgeQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vcgefpHCode)

    # Single-precision vacgt. The float returned by binaryOp(..., vacgtFunc,
    # ...) is treated as a status code: 0 produces an all-ones destReg (-1),
    # any other value produces 0, and 2.0 additionally sets fpscr.ioc.
    # Instantiated with toInt = True.
    vacgtfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, srcReg2, vacgtFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vacgt", "VacgtDFp", "SimdFloatCmpOp", ("float",),
            2, vacgtfpCode, toInt = True)
    threeEqualRegInstFp("vacgt", "VacgtQFp", "SimdFloatCmpOp", ("float",),
            4, vacgtfpCode, toInt = True)

    # uint16_t-element (FpH) vacgt: both operands go through fplibAbs before
    # fplibCompareGT; destElem is 0xFFFF on true and 0x0000 otherwise.
    vacgtfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGT(
            fplibAbs(srcElem1), fplibAbs(srcElem2), fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vacgt", "VacgtDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vacgtfpHCode)
    threeEqualRegInst("vacgt", "VacgtQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vacgtfpHCode)

    # Single-precision vacge. The float returned by binaryOp(..., vacgeFunc,
    # ...) is treated as a status code: 0 produces an all-ones destReg (-1),
    # any other value produces 0, and 2.0 additionally sets fpscr.ioc.
    # Instantiated with toInt = True.
    vacgefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, srcReg2, vacgeFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vacge", "VacgeDFp", "SimdFloatCmpOp", ("float",),
            2, vacgefpCode, toInt = True)
    threeEqualRegInstFp("vacge", "VacgeQFp", "SimdFloatCmpOp", ("float",),
            4, vacgefpCode, toInt = True)

    # uint16_t-element (FpH) vacge: both operands go through fplibAbs before
    # fplibCompareGE; destElem is 0xFFFF on true and 0x0000 otherwise.
    vacgefpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGE(
            fplibAbs(srcElem1), fplibAbs(srcElem2), fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vacge", "VacgeDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vacgefpHCode)
    threeEqualRegInst("vacge", "VacgeQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vacgefpHCode)

    # Single-precision vceq. The float returned by binaryOp(..., vceqFunc,
    # ...) is treated as a status code: 0 produces an all-ones destReg (-1),
    # any other value produces 0, and 2.0 additionally sets fpscr.ioc.
    # Instantiated with toInt = True.
    vceqfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, srcReg2, vceqFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vceq", "VceqDFp", "SimdFloatCmpOp", ("float",),
            2, vceqfpCode, toInt = True)
    threeEqualRegInstFp("vceq", "VceqQFp", "SimdFloatCmpOp", ("float",),
            4, vceqfpCode, toInt = True)

    # uint16_t-element (FpH) vceq: destElem is 0xFFFF when fplibCompareEQ
    # reports true and 0x0000 otherwise.
    vceqfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareEQ(srcElem1, srcElem2, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vceq", "VceqDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vceqfpHCode)
    threeEqualRegInst("vceq", "VceqQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vceqfpHCode)

    # Single-precision vrecps: runs binaryOp with fpRecpsS on the two source
    # registers, using a local copy of FpscrExc that is written back
    # afterwards.
    vrecpsCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRecpsS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    threeEqualRegInstFp("vrecps", "VrecpsDFp", "SimdFloatMultAccOp", ("float",), 2, vrecpsCode)
    threeEqualRegInstFp("vrecps", "VrecpsQFp", "SimdFloatMultAccOp", ("float",), 4, vrecpsCode)

    # uint16_t-element (FpH) vrecps: calls fplib32RecipStep on the two
    # source elements.
    vrecpsFpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplib32RecipStep(srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    threeEqualRegInst("vrecps", "VrecpsDFpH", "SimdFloatMultAccOp",
                      ("uint16_t",), 2, vrecpsFpHCode)
    threeEqualRegInst("vrecps", "VrecpsQFpH", "SimdFloatMultAccOp",
                      ("uint16_t",), 4, vrecpsFpHCode)

    # Single-precision VRSQRTS snippet: binaryOp() is invoked with the
    # fpRSqrtsS helper and VfpRoundNearest; fpscr round-trips through
    # FpscrExc.
    vrsqrtsCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destReg = binaryOp(fpscr, srcReg1, srcReg2, fpRSqrtsS,
                           true, true, VfpRoundNearest);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("VrsqrtsDFp", 2), ("VrsqrtsQFp", 4)):
        threeEqualRegInstFp("vrsqrts", _cls, "SimdFloatMiscOp",
                            ("float",), _regs, vrsqrtsCode)

    # Half-precision VRSQRTS snippet: destElem is the value returned by
    # fplib32RSqrtStep(); FPSCR handling mirrors the other fplib-based
    # snippets (fpVASimdFPSCRValue / fpRestoreFPSCRValue).
    vrsqrtsFpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplib32RSqrtStep(srcElem1, srcElem2, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("VrsqrtsDFpH", 2), ("VrsqrtsQFpH", 4)):
        threeEqualRegInst("vrsqrts", _cls, "SimdFloatMiscOp",
                          ("uint16_t",), _regs, vrsqrtsFpHCode)

    # Single-precision VABD snippet: the difference is computed by binaryOp()
    # with fpSubS under VfpRoundNearest, and fabsf() of that intermediate is
    # written to destReg.
    vabdfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float mid = binaryOp(fpscr, srcReg1, srcReg2, fpSubS,
                             true, true, VfpRoundNearest);
        destReg = fabsf(mid);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("VabdDFp", 2), ("VabdQFp", 4)):
        threeEqualRegInstFp("vabd", _cls, "SimdFloatAddOp", ("float",),
                            _regs, vabdfpCode)

    # Half-precision VABD snippet: fplibSub() of the two source elements is
    # passed through fplibAbs<Element>() and stored in destElem.
    vabdfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr));
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("VabdDFpH", 2), ("VabdQFpH", 4)):
        threeEqualRegInst("vabd", _cls, "SimdFloatAddOp", ("uint16_t",),
                          _regs, vabdfpHCode)

    # VMLA / VMLAL registrations built from the vmlaCode, vmlafpCode,
    # vmlafpHCode and vmlalCode snippets defined earlier in this file.
    # Integer forms use twoEqualRegInst, single-precision forms use
    # twoEqualRegInstFp, half-precision forms go through twoEqualRegInst
    # with uint16_t elements, and the long form uses twoRegLongInst.
    # Every call passes a trailing positional True (presumably the readDest
    # flag -- confirm against the helper signatures).
    twoEqualRegInst("vmla", "VmlasD", "SimdMultAccOp", unsignedTypes, 2,
                    vmlaCode, True)
    twoEqualRegInst("vmla", "VmlasQ", "SimdMultAccOp", unsignedTypes, 4,
                    vmlaCode, True)
    twoEqualRegInstFp("vmla", "VmlasDFp", "SimdFloatMultAccOp", ("float",), 2,
                      vmlafpCode, True)
    twoEqualRegInstFp("vmla", "VmlasQFp", "SimdFloatMultAccOp", ("float",), 4,
                      vmlafpCode, True)
    twoEqualRegInst("vmla", "VmlasDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                    2, vmlafpHCode, True)
    twoEqualRegInst("vmla", "VmlasQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                    4, vmlafpHCode, True)
    twoRegLongInst("vmlal", "Vmlals", "SimdMultAccOp", smallTypes, vmlalCode,
                   True)

    # VMLS / VMLSL registrations, mirroring the VMLA group above but using
    # the vmlsCode, vmlsfpCode, vmlsfpHCode and vmlslCode snippets defined
    # earlier in this file.  Note the integer forms here are registered for
    # allTypes, whereas the VMLA integer forms use unsignedTypes.
    twoEqualRegInst("vmls", "VmlssD", "SimdMultAccOp", allTypes, 2, vmlsCode, True)
    twoEqualRegInst("vmls", "VmlssQ", "SimdMultAccOp", allTypes, 4, vmlsCode, True)
    twoEqualRegInstFp("vmls", "VmlssDFp", "SimdFloatMultAccOp", ("float",), 2, vmlsfpCode, True)
    twoEqualRegInstFp("vmls", "VmlssQFp", "SimdFloatMultAccOp", ("float",), 4, vmlsfpCode, True)
    twoEqualRegInst("vmls", "VmlssDFpH", "SimdFloatMultAccOp", ("uint16_t",),
                    2, vmlsfpHCode, True)
    twoEqualRegInst("vmls", "VmlssQFpH", "SimdFloatMultAccOp", ("uint16_t",),
                    4, vmlsfpHCode, True)
    twoRegLongInst("vmlsl", "Vmlsls", "SimdMultAccOp", smallTypes, vmlslCode,
                   True)

    # VMUL / VMULL registrations using the vmulCode, vmulfpCode, vmulfpHCode
    # and vmullCode snippets defined earlier in this file.  Unlike the
    # VMLA/VMLS groups, no trailing True is passed here.
    twoEqualRegInst("vmul", "VmulsD", "SimdMultOp", allTypes, 2, vmulCode)
    twoEqualRegInst("vmul", "VmulsQ", "SimdMultOp", allTypes, 4, vmulCode)
    twoEqualRegInstFp("vmul", "VmulsDFp", "SimdFloatMultOp", ("float",), 2, vmulfpCode)
    twoEqualRegInstFp("vmul", "VmulsQFp", "SimdFloatMultOp", ("float",), 4, vmulfpCode)
    twoEqualRegInst("vmul", "VmulsDFpH", "SimdFloatMultOp", ("uint16_t",),
                    2, vmulfpHCode)
    twoEqualRegInst("vmul", "VmulsQFpH", "SimdFloatMultOp", ("uint16_t",),
                    4, vmulfpHCode)
    twoRegLongInst("vmull", "Vmulls", "SimdMultOp", smallTypes, vmullCode)

    # Saturating doubling multiply family, all reusing snippets defined
    # earlier in this file.  The long forms (vqdmull/vqdmlal/vqdmlsl) are
    # registered for smallTypes; the high-half forms use smallSignedTypes.
    twoRegLongInst("vqdmull", "Vqdmulls", "SimdMultOp", smallTypes, vqdmullCode)
    twoRegLongInst("vqdmlal", "Vqdmlals", "SimdMultAccOp", smallTypes, vqdmlalCode, True)
    twoRegLongInst("vqdmlsl", "Vqdmlsls", "SimdMultAccOp", smallTypes, vqdmlslCode, True)
    twoEqualRegInst("vqdmulh", "VqdmulhsD", "SimdMultOp", smallSignedTypes, 2, vqdmulhCode)
    twoEqualRegInst("vqdmulh", "VqdmulhsQ", "SimdMultOp", smallSignedTypes, 4, vqdmulhCode)
    twoEqualRegInst("vqrdmulh", "VqrdmulhsD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmulhCode)
    twoEqualRegInst("vqrdmulh", "VqrdmulhsQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmulhCode)
    # The accumulating vqrdmlah/vqrdmlsh forms read the destination
    # (readDest=True) and pass rdm_check through the `extra` keyword.
    twoEqualRegInst("vqrdmlah", "VqrdmlahsD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmlahCode, readDest=True,
            extra=rdm_check)
    twoEqualRegInst("vqrdmlah", "VqrdmlahsQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmlahCode, readDest=True,
            extra=rdm_check)
    twoEqualRegInst("vqrdmlsh", "VqrdmlshsD",
            "SimdMultOp", smallSignedTypes, 2, vqrdmlshCode, readDest=True,
            extra=rdm_check)
    twoEqualRegInst("vqrdmlsh", "VqrdmlshsQ",
            "SimdMultOp", smallSignedTypes, 4, vqrdmlshCode, readDest=True,
            extra=rdm_check)

    # Integer dot-product template.  `way` is the number of narrow
    # sub-elements packed in one Element.  Sub-element w of each source is
    # obtained by shifting right by 8 * sizeof(type) * w and casting to the
    # narrow type; the products are accumulated on top of the incoming
    # destElem.  %(src1_type)s / %(src2_type)s select the narrow type of
    # each operand.
    vdotCode = '''
        int way = sizeof(Element) / sizeof(%(src1_type)s);

        Element res = destElem;
        for (int w = 0; w < way; w ++) {
            Element src1_elem = (%(src1_type)s)
                (srcElem1 >> (8 * sizeof(%(src1_type)s) * w));
            Element src2_elem = (%(src2_type)s)
                (srcElem2 >> (8 * sizeof(%(src2_type)s) * w));
            res = res + src1_elem * src2_elem;
        }
        destElem = res;
    '''
    # One instantiation of the template per operand signedness combination.
    _dot_uu = vdotCode % {"src1_type": "uint8_t", "src2_type": "uint8_t"}
    _dot_ss = vdotCode % {"src1_type": "int8_t", "src2_type": "int8_t"}
    _dot_us = vdotCode % {"src1_type": "uint8_t", "src2_type": "int8_t"}
    _dot_su = vdotCode % {"src1_type": "int8_t", "src2_type": "uint8_t"}

    # vudot / vsdot through twoEqualRegInst (the *Elem* classes).
    twoEqualRegInst("vudot", "VudotElemD", "SimdMultOp", ("uint32_t",), 2,
                    _dot_uu, True)
    twoEqualRegInst("vudot", "VudotElemQ", "SimdMultOp", ("uint32_t",), 4,
                    _dot_uu, True)
    twoEqualRegInst("vsdot", "VsdotElemD", "SimdMultOp", ("int32_t",), 2,
                    _dot_ss, True)
    twoEqualRegInst("vsdot", "VsdotElemQ", "SimdMultOp", ("int32_t",), 4,
                    _dot_ss, True)

    # vudot / vsdot through threeEqualRegInst.
    threeEqualRegInst("vudot", "VudotD", "SimdMultOp", ("uint32_t",), 2,
                      _dot_uu, True)
    threeEqualRegInst("vudot", "VudotQ", "SimdMultOp", ("uint32_t",), 4,
                      _dot_uu, True)
    threeEqualRegInst("vsdot", "VsdotD", "SimdMultOp", ("int32_t",), 2,
                      _dot_ss, True)
    threeEqualRegInst("vsdot", "VsdotQ", "SimdMultOp", ("int32_t",), 4,
                      _dot_ss, True)

    # Mixed-sign vusdot / vsudot through twoEqualRegInst.
    twoEqualRegInst("vusdot", "VusdotElemD", "SimdMultOp", ("int32_t",), 2,
                    _dot_us, True)
    twoEqualRegInst("vusdot", "VusdotElemQ", "SimdMultOp", ("int32_t",), 4,
                    _dot_us, True)
    twoEqualRegInst("vsudot", "VsudotElemD", "SimdMultOp", ("int32_t",), 2,
                    _dot_su, True)
    twoEqualRegInst("vsudot", "VsudotElemQ", "SimdMultOp", ("int32_t",), 4,
                    _dot_su, True)

    # Mixed-sign vusdot through threeEqualRegInst.
    threeEqualRegInst("vusdot", "VusdotD", "SimdMultOp", ("int32_t",), 2,
                      _dot_us, True)
    threeEqualRegInst("vusdot", "VusdotQ", "SimdMultOp", ("int32_t",), 4,
                      _dot_us, True)

    # BF16 dot-product snippet.  Each 32-bit source element is split into
    # two 16-bit halves; the halves are multiplied pairwise with
    # fplibBfMulH(), the two products are summed with fplibAdd_Bf16(), and
    # that sum is added to the incoming destElem.  The snippet reads
    # FpscrExc but never writes it back.
    vdotBfCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);

        uint32_t sum = 0;
        for (int w = 0; w < 2; w ++) {
            uint16_t src1_elem = (uint16_t)(srcElem1 >> (16 * w));
            uint16_t src2_elem = (uint16_t)(srcElem2 >> (16 * w));
            uint32_t product = fplibBfMulH(src1_elem, src2_elem, fpscr);
            sum = !w ? product : fplibAdd_Bf16(sum, product, fpscr);
        }
        destElem = fplibAdd_Bf16(destElem, sum, fpscr);
    '''
    # twoEqualRegInst forms (D then Q) ...
    for _cls, _regs in (("VdotElemBfD", 2), ("VdotElemBfQ", 4)):
        twoEqualRegInst("vdot.bf16", _cls, "SimdBf16DotProdOp",
                        ("uint32_t",), _regs, vdotBfCode, True,
                        extra=bf16_check)

    # ... followed by the threeEqualRegInst forms (D then Q).
    for _cls, _regs in (("VdotBfD", 2), ("VdotBfQ", 4)):
        threeEqualRegInst("vdot.bf16", _cls, "SimdBf16DotProdOp",
                          ("uint32_t",), _regs, vdotBfCode, True,
                          extra=bf16_check)

    # 8-bit integer matrix multiply-accumulate template.  For each (i, j) in
    # a 2x2 result, the accumulator starts from destReg.elements[2*i + j]
    # and adds the products of the four narrow sub-elements of
    # srcReg1.elements[2*i + e] and srcReg2.elements[2*j + e] for e in
    # {0, 1}.  Elements are converted with letoh()/htole() on the way in
    # and out.  %(src1_type)s / %(src2_type)s pick each operand's narrow
    # type.
    vmmla8bCode = '''
        for (int i = 0; i < 2; i ++) {
            for (int j = 0; j < 2; j ++) {
                Element sum = letoh(destReg.elements[2 * i + j]);
                for (int e = 0; e < 2; e ++) {
                    Element srcElem1 = letoh(srcReg1.elements[2 * i + e]);
                    Element srcElem2 = letoh(srcReg2.elements[2 * j + e]);
                    for (int w = 0; w < 4; w ++) {
                        Element src1_elem = (%(src1_type)s)
                            (srcElem1 >> (8 * sizeof(%(src1_type)s) * w));
                        Element src2_elem = (%(src2_type)s)
                            (srcElem2 >> (8 * sizeof(%(src2_type)s) * w));
                        sum = sum + src1_elem * src2_elem;
                    }
                }
                destReg.elements[2 * i + j] = htole(sum);
            }
        }
    '''
    # (mnemonic, class name, accumulator type, src1 type, src2 type),
    # registered in this order.
    for _mnem, _cls, _acc, _lhs, _rhs in (
            ("vummla", "VummlaQ", "uint32_t", "uint8_t", "uint8_t"),
            ("vsmmla", "VsmmlaQ", "int32_t", "int8_t", "int8_t"),
            ("vusmmla", "VusmmlaQ", "int32_t", "uint8_t", "int8_t")):
        threeEqualRegInst(_mnem, _cls, "SimdMultOp", (_acc,), 4,
            vmmla8bCode % {"src1_type": _lhs, "src2_type": _rhs},
            readDest=True, complex=True)

    # BF16 matrix multiply-accumulate snippet.  For each (i, j) of a 2x2
    # result the accumulator starts from destReg.elements[2*i + j]; for
    # k in {0, 1} it takes srcReg1.elements[2*i + k] and
    # srcReg2.elements[2*j + k], multiplies their low 16-bit halves and
    # their high 16-bit halves with fplibBfMulH(), sums the two products
    # with fplibAdd_Bf16() and adds that to the accumulator.
    # Naming note: elt1_a/elt1_b are the LOW halves of srcElem1/srcElem2 and
    # elt2_a/elt2_b are the HIGH halves, i.e. the digit selects the half and
    # the letter selects the source operand.
    # The snippet reads FpscrExc but never writes it back.
    vmmlaBfCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);

        for (int i = 0; i < 2; i ++) {
            for (int j = 0; j < 2; j ++) {
                uint32_t sum = letoh(destReg.elements[2 * i + j]);
                for (int k = 0; k < 2; k ++) {
                    uint32_t srcElem1 = letoh(srcReg1.elements[2 * i + k]);
                    uint32_t srcElem2 = letoh(srcReg2.elements[2 * j + k]);

                    uint16_t elt1_a = (uint16_t)(srcElem1 >> (16 * 0));
                    uint16_t elt1_b = (uint16_t)(srcElem2 >> (16 * 0));
                    uint16_t elt2_a = (uint16_t)(srcElem1 >> (16 * 1));
                    uint16_t elt2_b = (uint16_t)(srcElem2 >> (16 * 1));

                    uint32_t product = fplibAdd_Bf16(
                        fplibBfMulH(elt1_a, elt1_b, fpscr),
                        fplibBfMulH(elt2_a, elt2_b, fpscr), fpscr);
                    sum = fplibAdd_Bf16(sum, product, fpscr);
                }
                destReg.elements[2 * i + j] = htole(sum);
            }
        }
    '''
    # Only a Q form is registered; bf16_check is supplied via `extra`.
    threeEqualRegInst("vmmla", "VmmlaBfQ", "SimdBf16MatMultAccOp",
        ("uint32_t",), 4, vmmlaBfCode, readDest=True, complex=True,
        extra=bf16_check)

    # VSHR (immediate) snippet: when imm is at least the bit width of
    # srcElem1 the result is -1 if ltz(srcElem1) holds and 0 otherwise;
    # for smaller shifts it is a plain srcElem1 >> imm.
    vshrCode = '''
        if (imm >= sizeof(srcElem1) * 8) {
            if (ltz(srcElem1))
                destElem = -1;
            else
                destElem = 0;
        } else {
            destElem = srcElem1 >> imm;
        }
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVshrD", 2), ("NVshrQ", 4)):
        twoRegShiftInst("vshr", _cls, "SimdShiftOp", allTypes, _regs,
                        vshrCode)

    # VSRA (shift right and accumulate) snippet.  `mid` holds the shifted
    # source and is added to the incoming destElem:
    #  - imm >= element width: mid is -1 when ltz(srcElem1) holds, else 0;
    #  - otherwise mid = srcElem1 >> imm, and if the source was negative but
    #    the shifted value is not, the bit at position (width - 1 - imm) is
    #    propagated upwards by OR-ing in its negation (a manual sign
    #    extension).
    # The declaration of `mid` previously ended in a stray second semicolon
    # (an empty statement in the generated C++); it has been removed.
    vsraCode = '''
        Element mid;
        if (imm >= sizeof(srcElem1) * 8) {
            mid = ltz(srcElem1) ? -1 : 0;
        } else {
            mid = srcElem1 >> imm;
            if (ltz(srcElem1) && !ltz(mid)) {
                mid |= -(mid & ((Element)1 <<
                            (sizeof(Element) * 8 - 1 - imm)));
            }
        }
        destElem += mid;
    '''
    # The trailing True is passed positionally (presumably readDest, since
    # the snippet accumulates into destElem -- confirm against
    # twoRegShiftInst).
    twoRegShiftInst("vsra", "NVsraD", "SimdShiftAccOp", allTypes, 2, vsraCode, True)
    twoRegShiftInst("vsra", "NVsraQ", "SimdShiftAccOp", allTypes, 4, vsraCode, True)

    # VRSHR (rounding shift right) snippet: a shift larger than the element
    # width yields 0; a non-zero shift adds the rounding bit (bit imm - 1 of
    # the source) to the source shifted right in two steps; a zero shift
    # copies the source.
    vrshrCode = '''
        if (imm > sizeof(srcElem1) * 8) {
            destElem = 0;
        } else if (imm) {
            Element rBit = bits(srcElem1, imm - 1);
            destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
        } else {
            destElem = srcElem1;
        }
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVrshrD", 2), ("NVrshrQ", 4)):
        twoRegShiftInst("vrshr", _cls, "SimdShiftOp", allTypes, _regs,
                        vrshrCode)

    # VRSRA (rounding shift right and accumulate) snippet: same rounding
    # shift as vrshrCode, but the shifted value is added to destElem rather
    # than assigned.
    vrsraCode = '''
        if (imm > sizeof(srcElem1) * 8) {
            destElem += 0;
        } else if (imm) {
            Element rBit = bits(srcElem1, imm - 1);
            destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
        } else {
            destElem += srcElem1;
        }
    '''
    # D form first, then Q form; the trailing True is positional.
    for _cls, _regs in (("NVrsraD", 2), ("NVrsraQ", 4)):
        twoRegShiftInst("vrsra", _cls, "SimdShiftAccOp", allTypes, _regs,
                        vrsraCode, True)

    # VSRI (shift right and insert) snippet: for a shift of at least the
    # element width destElem is left as it is; otherwise the shifted source
    # is OR-ed with the destination bits selected by
    # ~mask(width - imm).
    vsriCode = '''
        if (imm >= sizeof(Element) * 8) {
            destElem = destElem;
        } else {
            destElem = (srcElem1 >> imm) |
                (destElem & ~mask(sizeof(Element) * 8 - imm));
        }
    '''
    # D form first, then Q form; the trailing True is positional.
    for _cls, _regs in (("NVsriD", 2), ("NVsriQ", 4)):
        twoRegShiftInst("vsri", _cls, "SimdShiftOp", unsignedTypes, _regs,
                        vsriCode, True)

    # VSHL (immediate) snippet: a shift of at least the element width is
    # performed as (width - 1) followed by one more bit; smaller shifts are
    # a direct srcElem1 << imm.
    vshlCode = '''
        if (imm >= sizeof(Element) * 8) {
            destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
        } else {
            destElem = srcElem1 << imm;
        }
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVshlD", 2), ("NVshlQ", 4)):
        twoRegShiftInst("vshl", _cls, "SimdShiftOp", unsignedTypes, _regs,
                        vshlCode)

    # VSLI (shift left and insert) snippet: for a shift of at least the
    # element width destElem is left as it is; otherwise the shifted source
    # is OR-ed with the low imm bits of the destination (mask(imm)).
    vsliCode = '''
        if (imm >= sizeof(Element) * 8) {
            destElem = destElem;
        } else {
            destElem = (srcElem1 << imm) | (destElem & mask(imm));
        }
    '''
    # D form first, then Q form; the trailing True is positional.
    for _cls, _regs in (("NVsliD", 2), ("NVsliQ", 4)):
        twoRegShiftInst("vsli", _cls, "SimdShiftOp", unsignedTypes, _regs,
                        vsliCode, True)

    # VQSHL (signed saturating shift left, immediate) snippet.
    #  - imm >= element width: any non-zero source saturates to
    #    numeric_limits<Element>::min(), bit-complemented when the source is
    #    positive, and sets QC; a zero source gives 0.
    #  - 0 < imm < width: topBits is the source bit range from the top bit
    #    down to bit (width - 1 - imm).  Unless those bits are all zero or
    #    all ones (mask(imm + 1)), the result saturates as above and QC is
    #    set.
    #  - imm == 0: the source is copied.
    vqshlCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = std::numeric_limits<Element>::min();
                if (srcElem1 > 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else if (imm) {
            destElem = (srcElem1 << imm);
            uint64_t topBits = bits((uint64_t)srcElem1,
                                    sizeof(Element) * 8 - 1,
                                    sizeof(Element) * 8 - 1 - imm);
            if (topBits != 0 && topBits != mask(imm + 1)) {
                destElem = std::numeric_limits<Element>::min();
                if (srcElem1 > 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVqshlD", 2), ("NVqshlQ", 4)):
        twoRegShiftInst("vqshl", _cls, "SimdShiftOp", signedTypes, _regs,
                        vqshlCode)

    # Saturating shift left snippet registered for unsigned element types.
    #  - imm >= element width: a non-zero source saturates to
    #    mask(width) and sets QC; zero stays 0.
    #  - 0 < imm < width: if any of the source bits from the top bit down to
    #    bit (width - imm) is set, the result saturates to mask(width) and
    #    QC is set.
    #  - imm == 0: the source is copied.
    vqshluCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm >= sizeof(Element) * 8) {
            if (srcElem1 != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else if (imm) {
            destElem = (srcElem1 << imm);
            uint64_t topBits = bits((uint64_t)srcElem1,
                                    sizeof(Element) * 8 - 1,
                                    sizeof(Element) * 8 - imm);
            if (topBits != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVqshluD", 2), ("NVqshluQ", 4)):
        twoRegShiftInst("vqshlu", _cls, "SimdShiftOp", unsignedTypes,
                        _regs, vqshluCode)

    # Saturating shift left snippet registered for signed element types,
    # clamping to an unsigned range: in every branch a negative source gives
    # 0 with QC set.
    #  - imm >= element width: a positive source saturates to mask(width)
    #    with QC set; zero stays 0.
    #  - 0 < imm < width: a non-negative source saturates to mask(width)
    #    (QC set) when any bit from the top bit down to bit (width - imm) is
    #    set; otherwise the shifted value is kept.
    #  - imm == 0: a non-negative source is copied.
    vqshlusCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm >= sizeof(Element) * 8) {
            if (srcElem1 < 0) {
                destElem = 0;
                fpscr.qc = 1;
            } else if (srcElem1 > 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = 0;
            }
        } else if (imm) {
            destElem = (srcElem1 << imm);
            uint64_t topBits = bits((uint64_t)srcElem1,
                                    sizeof(Element) * 8 - 1,
                                    sizeof(Element) * 8 - imm);
            if (srcElem1 < 0) {
                destElem = 0;
                fpscr.qc = 1;
            } else if (topBits != 0) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            }
        } else {
            if (srcElem1 < 0) {
                fpscr.qc = 1;
                destElem = 0;
            } else {
                destElem = srcElem1;
            }
        }
        FpscrQc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVqshlusD", 2), ("NVqshlusQ", 4)):
        twoRegShiftInst("vqshlus", _cls, "SimdShiftOp", signedTypes, _regs,
                        vqshlusCode)

    # VSHRN (shift right and narrow) snippet: a shift of at least the bit
    # width of srcElem1 yields 0, otherwise destElem = srcElem1 >> imm.
    vshrnCode = '''
        if (imm >= sizeof(srcElem1) * 8) {
            destElem = 0;
        } else {
            destElem = srcElem1 >> imm;
        }
    '''
    twoRegNarrowShiftInst("vshrn", "NVshrn", "SimdShiftOp", smallUnsignedTypes, vshrnCode)

    # VRSHRN (rounding shift right and narrow) snippet: a shift larger than
    # the bit width of srcElem1 yields 0; a non-zero shift adds the rounding
    # bit (bit imm - 1 of the source) to the source shifted right in two
    # steps; a zero shift copies the source.
    vrshrnCode = '''
        if (imm > sizeof(srcElem1) * 8) {
            destElem = 0;
        } else if (imm) {
            Element rBit = bits(srcElem1, imm - 1);
            destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
        } else {
            destElem = srcElem1;
        }
    '''
    twoRegNarrowShiftInst("vrshrn", "NVrshrn", "SimdShiftOp", smallUnsignedTypes, vrshrnCode)

    # Signed saturating shift-right-narrow snippet (registered for
    # smallSignedTypes).
    #  - imm > width of srcElem1: result is 0; QC is set unless the source
    #    is 0 or -1.
    #  - imm != 0: the source is shifted right in two steps into a
    #    BigElement `mid`, then the bit at position
    #    (BigElement width - 1 - imm) is propagated upwards by OR-ing in its
    #    negation (manual sign extension).  If `mid` does not survive a
    #    round-trip cast to Element, the result saturates to
    #    mask(Element width - 1), bit-complemented for a negative source,
    #    and QC is set.
    #  - imm == 0: the source is assigned directly, with no saturation
    #    check in this branch.
    vqshrnCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0 && srcElem1 != -1)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
            mid |= -(mid & ((BigElement)1 <<
                        (sizeof(BigElement) * 8 - 1 - imm)));
            if (mid != (Element)mid) {
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqshrn", "NVqshrn", "SimdShiftOp", smallSignedTypes, vqshrnCode)

    # Saturating shift-right-narrow snippet registered for
    # smallUnsignedTypes (class NVqshrun).
    #  - imm > width of srcElem1: result is 0; QC is set for a non-zero
    #    source.
    #  - imm != 0: the source is shifted right in two steps into `mid`; if
    #    `mid` does not survive a round-trip cast to Element the result
    #    saturates to mask(Element width) and QC is set.
    #  - imm == 0: the source is assigned directly, with no saturation
    #    check in this branch.
    vqshrunCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
            if (mid != (Element)mid) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqshrun", "NVqshrun",
                          "SimdShiftOp", smallUnsignedTypes, vqshrunCode)

    # Saturating shift-right-narrow snippet registered for smallSignedTypes
    # under the same "vqshrun" mnemonic (class NVqshruns).
    #  - imm > width of srcElem1: result is 0; QC is set for a non-zero
    #    source.
    #  - imm != 0: the source is shifted right in two steps into `mid`; if
    #    any bit of `mid` at or above the Element width is set, the result
    #    is 0 for a negative source or mask(Element width) otherwise, and QC
    #    is set.
    #  - imm == 0: the source is assigned directly, with no saturation
    #    check in this branch.
    vqshrunsCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
            if (bits(mid, sizeof(BigElement) * 8 - 1,
                          sizeof(Element) * 8) != 0) {
                if (srcElem1 < 0) {
                    destElem = 0;
                } else {
                    destElem = mask(sizeof(Element) * 8);
                }
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqshrun", "NVqshruns",
                          "SimdShiftOp", smallSignedTypes, vqshrunsCode)

    # Signed saturating rounding shift-right-narrow snippet (registered for
    # smallSignedTypes).
    #  - imm > width of srcElem1: result is 0; QC is set unless the source
    #    is 0 or -1.
    #  - imm != 0: the source is shifted right by imm - 1, the low bit is
    #    kept as the rounding bit, the value is shifted one more position,
    #    manually sign-extended from bit (BigElement width - 1 - imm), and
    #    the rounding bit is added.  If the result does not survive a
    #    round-trip cast to Element it saturates to mask(Element width - 1),
    #    bit-complemented for a negative source, and QC is set.
    #  - imm == 0: the same saturation check is applied to the unshifted
    #    source.
    vqrshrnCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0 && srcElem1 != -1)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = (srcElem1 >> (imm - 1));
            uint64_t rBit = mid & 0x1;
            mid >>= 1;
            mid |= -(mid & ((BigElement)1 <<
                        (sizeof(BigElement) * 8 - 1 - imm)));
            mid += rBit;
            if (mid != (Element)mid) {
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            if (srcElem1 != (Element)srcElem1) {
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            } else {
                destElem = srcElem1;
            }
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqrshrn", "NVqrshrn",
                          "SimdShiftOp", smallSignedTypes, vqrshrnCode)

    # Saturating rounding shift-right-narrow snippet registered for
    # smallUnsignedTypes (class NVqrshrun).
    #  - imm > width of srcElem1: result is 0; QC is set for a non-zero
    #    source.
    #  - imm != 0: the source is shifted right by imm - 1, the low bit is
    #    kept as the rounding bit, the value is shifted one more position
    #    and the rounding bit is added.  If the result does not survive a
    #    round-trip cast to Element it saturates to mask(Element width) and
    #    QC is set.
    #  - imm == 0: the same saturation check is applied to the unshifted
    #    source.
    # The imm == 0 branch previously saturated to
    # mask(sizeof(Element) * 8 - 1), which is the signed maximum; for an
    # unsigned result the saturated value is the all-ones mask, matching
    # the shifted branch above it.
    vqrshrunCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = (srcElem1 >> (imm - 1));
            uint64_t rBit = mid & 0x1;
            mid >>= 1;
            mid += rBit;
            if (mid != (Element)mid) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            if (srcElem1 != (Element)srcElem1) {
                destElem = mask(sizeof(Element) * 8);
                fpscr.qc = 1;
            } else {
                destElem = srcElem1;
            }
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqrshrun", "NVqrshrun",
                          "SimdShiftOp", smallUnsignedTypes, vqrshrunCode)

    # Saturating rounding shift-right-narrow snippet registered for
    # smallSignedTypes under the "vqrshrun" mnemonic (class NVqrshruns).
    #  - imm > width of srcElem1: result is 0; QC is set for a non-zero
    #    source.
    #  - imm != 0: the source is shifted right by imm - 1, the low bit is
    #    kept as the rounding bit, the value is shifted one more position,
    #    manually sign-extended from bit (BigElement width - 1 - imm), and
    #    the rounding bit is added.  If any bit at or above the Element
    #    width is set, the result is 0 for a negative source or
    #    mask(Element width) otherwise, and QC is set.
    #  - imm == 0: a negative source gives 0 with QC set; any other source
    #    is assigned directly.
    vqrshrunsCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = (srcElem1 >> (imm - 1));
            uint64_t rBit = mid & 0x1;
            mid >>= 1;
            mid |= -(mid & ((BigElement)1 <<
                            (sizeof(BigElement) * 8 - 1 - imm)));
            mid += rBit;
            if (bits(mid, sizeof(BigElement) * 8 - 1,
                          sizeof(Element) * 8) != 0) {
                if (srcElem1 < 0) {
                    destElem = 0;
                } else {
                    destElem = mask(sizeof(Element) * 8);
                }
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            if (srcElem1 < 0) {
                fpscr.qc = 1;
                destElem = 0;
            } else {
                destElem = srcElem1;
            }
        }
        FpscrQc = fpscr;
    '''
    twoRegNarrowShiftInst("vqrshrun", "NVqrshruns",
                          "SimdShiftOp", smallSignedTypes, vqrshrunsCode)

    # VSHLL (shift left long) snippet: a shift of at least the bit width of
    # destElem yields 0; otherwise the source is widened to BigElement
    # before being shifted left by imm.
    vshllCode = '''
        if (imm >= sizeof(destElem) * 8) {
            destElem = 0;
        } else {
            destElem = (BigElement)srcElem1 << imm;
        }
    '''
    twoRegLongShiftInst("vshll", "NVshll", "SimdShiftOp", smallTypes, vshllCode)

    # VMOVL snippet: a plain element copy, registered through the same
    # twoRegLongShiftInst helper as VSHLL but with the SimdMiscOp class.
    vmovlCode = '''
        destElem = srcElem1;
    '''
    twoRegLongShiftInst("vmovl", "NVmovl", "SimdMiscOp", smallTypes, vmovlCode)

    # Single-precision float -> fixed-point VCVT snippet (second argument
    # of vfpFpToFixed is `false`).  IDC is set when flushToZero() reports a
    # flush of the source.  The conversion runs between prepFpState() and
    # finishVfp(), and is bracketed by empty asm statements with memory
    # constraints (presumably to keep the compiler from moving it across
    # the FP-state changes -- confirm).
    vcvt2ufxCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        if (flushToZero(srcElem1))
            fpscr.idc = 1;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
        destReg = vfpFpToFixed<float>(srcElem1, false, 32, imm);
        __asm__ __volatile__("" :: "m" (destReg));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvt2ufxD", 2), ("NVcvt2ufxQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("float",),
                        _regs, vcvt2ufxCode, toInt = True)

    # Half-precision float -> fixed-point VCVT snippet: calls
    # fplibFPToFixed<uint16_t, uint16_t>() with imm, `true` as the third
    # argument and FPRounding_ZERO.
    vcvt2ufxHCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibFPToFixed<uint16_t, uint16_t>(
            srcElem1, imm, true, FPRounding_ZERO, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvt2ufxHD", 2), ("NVcvt2ufxHQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("uint16_t",),
                        _regs, vcvt2ufxHCode)

    # Single-precision float -> fixed-point VCVT snippet; identical to
    # vcvt2ufxCode except that the second argument of vfpFpToFixed is
    # `true`.
    vcvt2sfxCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        if (flushToZero(srcElem1))
            fpscr.idc = 1;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
        destReg = vfpFpToFixed<float>(srcElem1, true, 32, imm);
        __asm__ __volatile__("" :: "m" (destReg));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvt2sfxD", 2), ("NVcvt2sfxQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("float",),
                        _regs, vcvt2sfxCode, toInt = True)

    # Half-precision float -> fixed-point VCVT snippet; identical to
    # vcvt2ufxHCode except that the third argument of fplibFPToFixed is
    # `false`.
    vcvt2sfxHCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibFPToFixed<uint16_t, uint16_t>(
            srcElem1, imm, false, FPRounding_ZERO, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvt2sfxHD", 2), ("NVcvt2sfxHQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("uint16_t",),
                        _regs, vcvt2sfxHCode)

    # Fixed-point -> single-precision VCVT snippet built on
    # vfpUFixedToFpS(), executed between prepFpState(VfpRoundNearest) and
    # finishVfp() and bracketed by empty asm statements with memory
    # constraints.
    vcvtu2fpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1));
        destElem = vfpUFixedToFpS(true, true, srcReg1, 32, imm);
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvtu2fpD", 2), ("NVcvtu2fpQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("float",),
                        _regs, vcvtu2fpCode, fromInt = True)

    # Fixed-point -> half-precision VCVT snippet: calls
    # fplibFixedToFP<uint16_t>() on the raw source element with imm, `true`
    # as the third argument and FPRounding_TIEEVEN.
    vcvtu2fpHCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibFixedToFP<uint16_t>(
            srcElem1, imm, true, FPRounding_TIEEVEN, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvtu2fpHD", 2), ("NVcvtu2fpHQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("uint16_t",),
                        _regs, vcvtu2fpHCode)

    # Fixed-point -> single-precision VCVT snippet; same shape as
    # vcvtu2fpCode but built on vfpSFixedToFpS().
    vcvts2fpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcReg1) : "m" (srcReg1));
        destElem = vfpSFixedToFpS(true, true, srcReg1, 32, imm);
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvts2fpD", 2), ("NVcvts2fpQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("float",),
                        _regs, vcvts2fpCode, fromInt = True)

    # Fixed-point -> half-precision VCVT snippet; differs from
    # vcvtu2fpHCode in that the source is first passed through sext<16>()
    # and the third argument of fplibFixedToFP is `false`.
    vcvts2fpHCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibFixedToFP<uint16_t>(
            sext<16>(srcElem1), imm, false, FPRounding_TIEEVEN, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    # D form first, then Q form.
    for _cls, _regs in (("NVcvts2fpHD", 2), ("NVcvts2fpHQ", 4)):
        twoRegShiftInst("vcvt", _cls, "SimdCvtOp", ("uint16_t",),
                        _regs, vcvts2fpHCode)

    # Narrowing VCVT snippet built on vcvtFpSFpH().  The source element bits
    # are reinterpreted as a float with bitsToFp(); IDC is set when
    # flushToZero() reports a flush of that value.  The conversion is given
    # fpscr.ahp and VfpRoundNearest, runs between prepFpState() and
    # finishVfp(), and is bracketed by empty asm statements with memory
    # constraints (presumably compiler barriers around the FP-state change
    # -- confirm).  destElem is zeroed before use.
    vcvts2hCode = '''
        destElem = 0;
        FPSCR fpscr = (FPSCR) FpscrExc;
        float srcFp1 = bitsToFp(srcElem1, (float)0.0);
        if (flushToZero(srcFp1))
            fpscr.idc = 1;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcFp1), "=m" (destElem)
                                : "m" (srcFp1), "m" (destElem));
        destElem = vcvtFpSFpH(fpscr, true, true, VfpRoundNearest,
                              fpscr.ahp, srcFp1);
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    twoRegNarrowMiscInst("vcvt", "NVcvts2h", "SimdCvtOp", ("uint16_t",), vcvts2hCode)

    # Narrowing VCVT snippet built on fplibConvertBF(), using the rounding
    # mode returned by FPCRRounding(fpscr).  Registered with bf16_check
    # passed through the `extra_check` keyword.
    vcvts2bfCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibConvertBF(srcElem1, FPCRRounding(fpscr), fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegNarrowMiscInst("vcvt", "NVcvts2bf", "SimdCvtOp", ("uint16_t",),
                         vcvts2bfCode, extra_check=bf16_check)

    # Widening VCVT snippet built on vcvtFpHFpS(), whose result is converted
    # back to raw bits with fpToBits().  The conversion is given fpscr.ahp,
    # runs between prepFpState(VfpRoundNearest) and finishVfp(), and is
    # bracketed by empty asm statements with memory constraints.  destElem
    # is zeroed before use.
    vcvth2sCode = '''
        destElem = 0;
        FPSCR fpscr = (FPSCR) FpscrExc;
        VfpSavedState state = prepFpState(VfpRoundNearest);
        __asm__ __volatile__("" : "=m" (srcElem1), "=m" (destElem)
                                : "m" (srcElem1), "m" (destElem));
        destElem = fpToBits(vcvtFpHFpS(fpscr, true, fpscr.ahp, srcElem1));
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''
    twoRegLongMiscInst("vcvt", "NVcvth2s", "SimdCvtOp", ("uint16_t",), vcvth2sCode)

    # Template for the half-precision VCVT{A,N,P,M} float -> 16-bit integer
    # conversions.  The two %s slots are, in order, the third argument of
    # fplibFPToFixed ("true" for the .u16 forms, "false" for the .s16 forms)
    # and the FPRounding_* mode.  The fixed-point bit count is hard-wired
    # to 0.
    vcvthp2hCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibFPToFixed<uint16_t, uint16_t>(
            srcElem1, 0, %s, %s, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''

    # --- .u16.f16 forms -------------------------------------------------
    vcvtahp2uhCode = vcvthp2hCode % ("true", "FPRounding_TIEAWAY")
    for _cls, _regs in (("NVcvt2uhAD", 2), ("NVcvt2uhAQ", 4)):
        twoRegMiscInst("vcvta.u16.f16", _cls, "SimdCvtOp",
                       ("uint16_t",), _regs, vcvtahp2uhCode)

    vcvtnhp2uhCode = vcvthp2hCode % ("true", "FPRounding_TIEEVEN")
    for _cls, _regs in (("NVcvt2uhND", 2), ("NVcvt2uhNQ", 4)):
        twoRegMiscInst("vcvtn.u16.f16", _cls, "SimdCvtOp",
                       ("uint16_t",), _regs, vcvtnhp2uhCode)

    vcvtphp2uhCode = vcvthp2hCode % ("true", "FPRounding_POSINF")
    for _cls, _regs in (("NVcvt2uhPD", 2), ("NVcvt2uhPQ", 4)):
        twoRegMiscInst("vcvtp.u16.f16", _cls, "SimdCvtOp",
                       ("uint16_t",), _regs, vcvtphp2uhCode)

    vcvtmhp2uhCode = vcvthp2hCode % ("true", "FPRounding_NEGINF")
    for _cls, _regs in (("NVcvt2uhMD", 2), ("NVcvt2uhMQ", 4)):
        twoRegMiscInst("vcvtm.u16.f16", _cls, "SimdCvtOp",
                       ("uint16_t",), _regs, vcvtmhp2uhCode)

    # --- .s16.f16 forms -------------------------------------------------
    vcvtahp2shCode = vcvthp2hCode % ("false", "FPRounding_TIEAWAY")
    for _cls, _regs in (("NVcvt2shAD", 2), ("NVcvt2shAQ", 4)):
        twoRegMiscInst("vcvta.s16.f16", _cls, "SimdCvtOp",
                       ("int16_t",), _regs, vcvtahp2shCode)

    vcvtnhp2shCode = vcvthp2hCode % ("false", "FPRounding_TIEEVEN")
    for _cls, _regs in (("NVcvt2shND", 2), ("NVcvt2shNQ", 4)):
        twoRegMiscInst("vcvtn.s16.f16", _cls, "SimdCvtOp",
                       ("int16_t",), _regs, vcvtnhp2shCode)

    vcvtphp2shCode = vcvthp2hCode % ("false", "FPRounding_POSINF")
    for _cls, _regs in (("NVcvt2shPD", 2), ("NVcvt2shPQ", 4)):
        twoRegMiscInst("vcvtp.s16.f16", _cls, "SimdCvtOp",
                       ("int16_t",), _regs, vcvtphp2shCode)

    vcvtmhp2shCode = vcvthp2hCode % ("false", "FPRounding_NEGINF")
    for _cls, _regs in (("NVcvt2shMD", 2), ("NVcvt2shMQ", 4)):
        twoRegMiscInst("vcvtm.s16.f16", _cls, "SimdCvtOp",
                       ("int16_t",), _regs, vcvtmhp2shCode)

    # Single-precision -> 32-bit integer VCVT{A,N,P,M}.  The shared body
    # sets fpscr.idc when flushToZero(mid) reports true, then converts
    # with vfpFpToFixed.  Its two %s slots are the second argument
    # ("false" for the .u32 forms, "true" for the .s32 forms) and the
    # trailing VfpRound* rounding-mode argument.
    vcvtsp2sCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        VfpSavedState state = prepFpState(fpscr.rMode);
        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
        float mid = bitsToFp(srcElem1, (float)0.0);
        if (flushToZero(mid))
            fpscr.idc = 1;
        destElem = vfpFpToFixed<float>(mid, %s, 32, 0, true, %s);
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''

    # One specialised body per (signedness, rounding) combination.
    vcvtasp2usCode = vcvtsp2sCode % ("false", "VfpRoundAway")
    vcvtnsp2usCode = vcvtsp2sCode % ("false", "VfpRoundNearest")
    vcvtpsp2usCode = vcvtsp2sCode % ("false", "VfpRoundUpward")
    vcvtmsp2usCode = vcvtsp2sCode % ("false", "VfpRoundDown")
    vcvtasp2ssCode = vcvtsp2sCode % ("true", "VfpRoundAway")
    vcvtnsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundNearest")
    vcvtpsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundUpward")
    vcvtmsp2ssCode = vcvtsp2sCode % ("true", "VfpRoundDown")

    # (mnemonic, class-name stem, element type, body), listed in emission
    # order.  Each row is registered twice: stem + "D" with 2 and
    # stem + "Q" with 4.
    spCvtVariants = (
        ("vcvta.u32.f32", "NVcvt2usA", "uint32_t", vcvtasp2usCode),
        ("vcvtn.u32.f32", "NVcvt2usN", "uint32_t", vcvtnsp2usCode),
        ("vcvtp.u32.f32", "NVcvt2usP", "uint32_t", vcvtpsp2usCode),
        ("vcvtm.u32.f32", "NVcvt2usM", "uint32_t", vcvtmsp2usCode),
        ("vcvta.s32.f32", "NVcvt2ssA", "int32_t", vcvtasp2ssCode),
        ("vcvtn.s32.f32", "NVcvt2ssN", "int32_t", vcvtnsp2ssCode),
        ("vcvtp.s32.f32", "NVcvt2ssP", "int32_t", vcvtpsp2ssCode),
        ("vcvtm.s32.f32", "NVcvt2ssM", "int32_t", vcvtmsp2ssCode),
    )
    for spMnem, spStem, spElemType, spBody in spCvtVariants:
        twoRegMiscInst(spMnem, spStem + "D", "SimdCvtOp",
                       (spElemType,), 2, spBody)
        twoRegMiscInst(spMnem, spStem + "Q", "SimdCvtOp",
                       (spElemType,), 4, spBody)

    # Half-precision VRINT{N,X,A,Z,M,P}.  The shared body calls
    # fplibRoundInt; its two %s slots are the second argument (an
    # FPRounding_* constant) and the third argument, which is "true" only
    # for vrintx.
    vrinthpCode = '''
        FPSCR fpscr = fpVASimdCvtFPSCRValue((FPSCR)FpscrExc);
        destElem = fplibRoundInt<uint16_t>(srcElem1, %s, %s, fpscr);
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''

    # One specialised body per rounding variant.
    vrintnhpCode = vrinthpCode % ("FPRounding_TIEEVEN", "false")
    vrintxhpCode = vrinthpCode % ("FPRounding_TIEEVEN", "true")
    vrintahpCode = vrinthpCode % ("FPRounding_TIEAWAY", "false")
    vrintzhpCode = vrinthpCode % ("FPRounding_ZERO", "false")
    vrintmhpCode = vrinthpCode % ("FPRounding_NEGINF", "false")
    vrintphpCode = vrinthpCode % ("FPRounding_POSINF", "false")

    # The variant letter selects both the mnemonic ("vrint<l>.f16") and the
    # class-name stem ("NVrint<l>hp"); rows are in emission order and each
    # is registered as a D form (2) followed by a Q form (4).
    hpRintVariants = (
        ("n", vrintnhpCode),
        ("x", vrintxhpCode),
        ("a", vrintahpCode),
        ("z", vrintzhpCode),
        ("m", vrintmhpCode),
        ("p", vrintphpCode),
    )
    for hpRintLetter, hpRintBody in hpRintVariants:
        hpRintMnem = "vrint%s.f16" % hpRintLetter
        hpRintStem = "NVrint%shp" % hpRintLetter
        twoRegMiscInst(hpRintMnem, hpRintStem + "D", "SimdCvtOp",
                       ("uint16_t",), 2, hpRintBody)
        twoRegMiscInst(hpRintMnem, hpRintStem + "Q", "SimdCvtOp",
                       ("uint16_t",), 4, hpRintBody)

    # Single-precision VRINT{N,X,A,Z,M,P}.  The shared body sets fpscr.idc
    # when flushToZero(mid) reports true and then rounds with vfpFpRint.
    # Its two %s slots are the second argument ("true" only for vrintx)
    # and the trailing VfpRound* rounding-mode argument.
    vrintspCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        VfpSavedState state = prepFpState(fpscr.rMode);
        __asm__ __volatile__("" : "=m" (srcElem1) : "m" (srcElem1));
        float mid = bitsToFp(srcElem1, (float)0.0);
        if (flushToZero(mid))
            fpscr.idc = 1;
        float mid2 = vfpFpRint<float>(mid, %s, fpscr.dn, true, %s);
        destElem = fpToBits(mid2);
        __asm__ __volatile__("" :: "m" (destElem));
        finishVfp(fpscr, state, true);
        FpscrExc = fpscr;
    '''

    # One specialised body per rounding variant.
    vrintnspCode = vrintspCode % ("false", "VfpRoundNearest")
    vrintxspCode = vrintspCode % ("true", "VfpRoundNearest")
    vrintaspCode = vrintspCode % ("false", "VfpRoundAway")
    vrintzspCode = vrintspCode % ("false", "VfpRoundZero")
    vrintmspCode = vrintspCode % ("false", "VfpRoundDown")
    vrintpspCode = vrintspCode % ("false", "VfpRoundUpward")

    # The variant letter selects both the mnemonic ("vrint<l>.f32") and the
    # class-name stem ("NVrint<l>sp"); rows are in emission order and each
    # is registered as a D form (2) followed by a Q form (4).
    spRintVariants = (
        ("n", vrintnspCode),
        ("x", vrintxspCode),
        ("a", vrintaspCode),
        ("z", vrintzspCode),
        ("m", vrintmspCode),
        ("p", vrintpspCode),
    )
    for spRintLetter, spRintBody in spRintVariants:
        spRintMnem = "vrint%s.f32" % spRintLetter
        spRintStem = "NVrint%ssp" % spRintLetter
        twoRegMiscInst(spRintMnem, spRintStem + "D", "SimdCvtOp",
                       ("uint32_t",), 2, spRintBody)
        twoRegMiscInst(spRintMnem, spRintStem + "Q", "SimdCvtOp",
                       ("uint32_t",), 4, spRintBody)

    # VRSQRTE, integer form: each uint32_t element is passed through
    # unsignedRSqrtEstimate().  Registered with 2 and 4 under the D- and
    # Q-suffixed class names.
    vrsqrteCode = '''
        destElem = unsignedRSqrtEstimate(srcElem1);
    '''
    twoRegMiscInst("vrsqrte", "NVrsqrteD", "SimdSqrtOp", ("uint32_t",), 2, vrsqrteCode)
    twoRegMiscInst("vrsqrte", "NVrsqrteQ", "SimdSqrtOp", ("uint32_t",), 4, vrsqrteCode)

    # VRSQRTE, single-precision form: sets fpscr.idc when
    # flushToZero(srcReg1) reports true, then stores
    # fprSqrtEstimate(fpscr, srcReg1) and writes the FPSCR copy back.
    vrsqrtefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        if (flushToZero(srcReg1))
            fpscr.idc = 1;
        destReg = fprSqrtEstimate(fpscr, srcReg1);
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vrsqrte", "NVrsqrteDFp", "SimdFloatSqrtOp", ("float",), 2, vrsqrtefpCode)
    twoRegMiscInstFp("vrsqrte", "NVrsqrteQFp", "SimdFloatSqrtOp", ("float",), 4, vrsqrtefpCode)

    # VRSQRTE, half-precision form: operates on the raw uint16_t element
    # via fprSqrtEstimateFpH(); unlike the single-precision body there is
    # no explicit flushToZero() check here.
    vrsqrtefpHCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destElem = fprSqrtEstimateFpH(fpscr, srcElem1);
        FpscrExc = fpscr;
    '''
    twoRegMiscInst("vrsqrte", "NVrsqrteDFpH", "SimdFloatSqrtOp", ("uint16_t",),
                   2, vrsqrtefpHCode)
    twoRegMiscInst("vrsqrte", "NVrsqrteQFpH", "SimdFloatSqrtOp", ("uint16_t",),
                   4, vrsqrtefpHCode)

    # VRECPE, integer form: each uint32_t element is passed through
    # unsignedRecipEstimate().
    vrecpeCode = '''
        destElem = unsignedRecipEstimate(srcElem1);
    '''
    twoRegMiscInst("vrecpe", "NVrecpeD", "SimdMultAccOp", ("uint32_t",), 2, vrecpeCode)
    twoRegMiscInst("vrecpe", "NVrecpeQ", "SimdMultAccOp", ("uint32_t",), 4, vrecpeCode)

    # VRECPE, single-precision form: sets fpscr.idc when
    # flushToZero(srcReg1) reports true, then stores
    # fpRecipEstimate(fpscr, srcReg1) and writes the FPSCR copy back.
    vrecpefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        if (flushToZero(srcReg1))
            fpscr.idc = 1;
        destReg = fpRecipEstimate(fpscr, srcReg1);
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vrecpe", "NVrecpeDFp", "SimdFloatMultAccOp", ("float",), 2, vrecpefpCode)
    twoRegMiscInstFp("vrecpe", "NVrecpeQFp", "SimdFloatMultAccOp", ("float",), 4, vrecpefpCode)

    # VRECPE, half-precision form: operates on the raw uint16_t element via
    # fpRecipEstimateFpH() and writes the FPSCR copy back.
    #
    # The op class is the floating-point one ("SimdFloatMultAccOp"), the
    # same class the single-precision NVrecpe{D,Q}Fp forms use and in line
    # with the other FpH instructions in this file, which all use
    # SimdFloat* classes.
    vrecpefpHCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        destElem = fpRecipEstimateFpH(fpscr, srcElem1);
        FpscrExc = fpscr;
    '''
    twoRegMiscInst("vrecpe", "NVrecpeDFpH", "SimdFloatMultAccOp",
                   ("uint16_t",), 2, vrecpefpHCode)
    twoRegMiscInst("vrecpe", "NVrecpeQFpH", "SimdFloatMultAccOp",
                   ("uint16_t",), 4, vrecpefpHCode)

    # VREV16/32/64: the three bodies differ only in the group width, which
    # is (1 << 1), (1 << 2) and (1 << 3) bytes respectively.  groupSize is
    # that width divided by sizeof(Element), and the element index i is
    # XORed with (groupSize - 1) to produce j.
    # NOTE(review): j is presumably the destination index consumed by the
    # twoRegMiscInst template -- confirm against its definition.
    vrev16Code = '''
        destElem = srcElem1;
        unsigned groupSize = ((1 << 1) / sizeof(Element));
        unsigned reverseMask = (groupSize - 1);
        j = i ^ reverseMask;
    '''
    twoRegMiscInst("vrev16", "NVrev16D", "SimdAluOp", ("uint8_t",), 2, vrev16Code)
    twoRegMiscInst("vrev16", "NVrev16Q", "SimdAluOp", ("uint8_t",), 4, vrev16Code)
    # 32-bit groups: instantiated for 8- and 16-bit elements.
    vrev32Code = '''
        destElem = srcElem1;
        unsigned groupSize = ((1 << 2) / sizeof(Element));
        unsigned reverseMask = (groupSize - 1);
        j = i ^ reverseMask;
    '''
    twoRegMiscInst("vrev32", "NVrev32D",
            "SimdAluOp", ("uint8_t", "uint16_t"), 2, vrev32Code)
    twoRegMiscInst("vrev32", "NVrev32Q",
            "SimdAluOp", ("uint8_t", "uint16_t"), 4, vrev32Code)
    # 64-bit groups: instantiated for every type in smallUnsignedTypes
    # (defined outside this chunk).
    vrev64Code = '''
        destElem = srcElem1;
        unsigned groupSize = ((1 << 3) / sizeof(Element));
        unsigned reverseMask = (groupSize - 1);
        j = i ^ reverseMask;
    '''
    twoRegMiscInst("vrev64", "NVrev64D", "SimdAluOp", smallUnsignedTypes, 2, vrev64Code)
    twoRegMiscInst("vrev64", "NVrev64Q", "SimdAluOp", smallUnsignedTypes, 4, vrev64Code)

    # Calls split('exec') and then appends vcompares + vcomparesL (both
    # defined outside this chunk) to exec_output.
    # NOTE(review): split() presumably starts a new generated exec file --
    # confirm against the ISA parser.
    split('exec')
    exec_output += vcompares + vcomparesL

    # VPADDL: the two source elements are widened to BigElement before
    # being added, and the sum is written to destElem.
    vpaddlCode = '''
        destElem = (BigElement)srcElem1 + (BigElement)srcElem2;
    '''
    twoRegCondenseInst("vpaddl", "NVpaddlD", "SimdAddOp", smallTypes, 2, vpaddlCode)
    twoRegCondenseInst("vpaddl", "NVpaddlQ", "SimdAddOp", smallTypes, 4, vpaddlCode)

    # VPADAL: same widened pairwise sum, but accumulated into destElem
    # (+=).  The extra True argument is passed only for this accumulating
    # form.
    # NOTE(review): True presumably asks the helper to read the
    # destination first -- confirm against twoRegCondenseInst.
    vpadalCode = '''
        destElem += (BigElement)srcElem1 + (BigElement)srcElem2;
    '''
    twoRegCondenseInst("vpadal", "NVpadalD", "SimdAddAccOp", smallTypes, 2, vpadalCode, True)
    twoRegCondenseInst("vpadal", "NVpadalQ", "SimdAddAccOp", smallTypes, 4, vpadalCode, True)

    # VCLS: after one initial left shift (which discards the sign bit),
    # counts how many further shifts keep the value on the same side of
    # zero as the original, capped at sizeof(Element) * 8 - 1.
    # Instantiated for signedTypes so the "< 0" test reads the top bit.
    vclsCode = '''
        unsigned count = 0;
        if (srcElem1 < 0) {
            srcElem1 <<= 1;
            while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
                count++;
                srcElem1 <<= 1;
            }
        } else {
            srcElem1 <<= 1;
            while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
                count++;
                srcElem1 <<= 1;
            }
        }
        destElem = count;
    '''
    twoRegMiscInst("vcls", "NVclsD", "SimdAluOp", signedTypes, 2, vclsCode)
    twoRegMiscInst("vcls", "NVclsQ", "SimdAluOp", signedTypes, 4, vclsCode)

    # VCLZ: shifts left while the (signed) value is non-negative, i.e.
    # while the top bit is clear, counting the shifts up to the element
    # width in bits.
    vclzCode = '''
        unsigned count = 0;
        while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
            count++;
            srcElem1 <<= 1;
        }
        destElem = count;
    '''
    twoRegMiscInst("vclz", "NVclzD", "SimdAluOp", signedTypes, 2, vclzCode)
    twoRegMiscInst("vclz", "NVclzQ", "SimdAluOp", signedTypes, 4, vclzCode)

    # VCNT: population count -- adds the low bit to count and shifts right
    # until the value is zero or every bit has been visited.
    vcntCode = '''
        unsigned count = 0;
        while (srcElem1 && count < sizeof(Element) * 8) {
            count += srcElem1 & 0x1;
            srcElem1 >>= 1;
        }
        destElem = count;
    '''

    twoRegMiscInst("vcnt", "NVcntD", "SimdAluOp", unsignedTypes, 2, vcntCode)
    twoRegMiscInst("vcnt", "NVcntQ", "SimdAluOp", unsignedTypes, 4, vcntCode)

    # VMVN (register form): bitwise NOT of each element; instantiated for
    # uint64_t only.  Note that the name vmvnCode is re-bound further down
    # in this file for the immediate form.
    vmvnCode = '''
        destElem = ~srcElem1;
    '''
    twoRegMiscInst("vmvn", "NVmvnD", "SimdAluOp", ("uint64_t",), 2, vmvnCode)
    twoRegMiscInst("vmvn", "NVmvnQ", "SimdAluOp", ("uint64_t",), 4, vmvnCode)

    # VQABS: saturating absolute value.  When the source equals
    # numeric_limits<Element>::min() the body sets fpscr.qc and writes
    # ~srcElem1 (the bitwise complement of the minimum); otherwise it
    # negates negative inputs and passes non-negative ones through.
    vqabsCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            fpscr.qc = 1;
            destElem = ~srcElem1;
        } else if (srcElem1 < 0) {
            destElem = -srcElem1;
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoRegMiscInst("vqabs", "NVqabsD", "SimdAluOp", signedTypes, 2, vqabsCode)
    twoRegMiscInst("vqabs", "NVqabsQ", "SimdAluOp", signedTypes, 4, vqabsCode)

    # VQNEG: saturating negate.  Same special case as VQABS for the
    # minimum value (fpscr.qc set, ~srcElem1 written); every other input
    # is simply negated.
    vqnegCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            fpscr.qc = 1;
            destElem = ~srcElem1;
        } else {
            destElem = -srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoRegMiscInst("vqneg", "NVqnegD", "SimdAluOp", signedTypes, 2, vqnegCode)
    twoRegMiscInst("vqneg", "NVqnegQ", "SimdAluOp", signedTypes, 4, vqnegCode)

    # VABS, integer form: negates negative elements, with no saturation
    # handling and no FPSCR access (contrast with VQABS).
    vabsCode = '''
        if (srcElem1 < 0) {
            destElem = -srcElem1;
        } else {
            destElem = srcElem1;
        }
    '''

    twoRegMiscInst("vabs", "NVabsD", "SimdAluOp", signedTypes, 2, vabsCode)
    twoRegMiscInst("vabs", "NVabsQ", "SimdAluOp", signedTypes, 4, vabsCode)

    # VABS, single-precision form: views the float through a union and
    # keeps only the low (sizeof(Element) * 8 - 1) bits, i.e. clears the
    # top bit of the encoding without touching the FPSCR.
    vabsfpCode = '''
        union
        {
            uint32_t i;
            float f;
        } cStruct;
        cStruct.f = srcReg1;
        cStruct.i &= mask(sizeof(Element) * 8 - 1);
        destReg = cStruct.f;
    '''
    twoRegMiscInstFp("vabs", "NVabsDFp", "SimdFloatAluOp", ("float",), 2, vabsfpCode)
    twoRegMiscInstFp("vabs", "NVabsQFp", "SimdFloatAluOp", ("float",), 4, vabsfpCode)

    # VABS, half-precision form: the element is already the raw uint16_t
    # encoding, so the top bit is masked off directly.
    vabsfpHCode = '''
        destElem = srcElem1 & mask(sizeof(uint16_t) * 8 - 1);
    '''
    twoRegMiscInst("vabs", "NVabsDFpH", "SimdFloatAluOp", ("uint16_t",),
            2, vabsfpHCode)
    twoRegMiscInst("vabs", "NVabsQFpH", "SimdFloatAluOp", ("uint16_t",),
            4, vabsfpHCode)

    # VNEG, integer form: plain two's-complement negate, no saturation.
    vnegCode = '''
        destElem = -srcElem1;
    '''
    twoRegMiscInst("vneg", "NVnegD", "SimdAluOp", signedTypes, 2, vnegCode)
    twoRegMiscInst("vneg", "NVnegQ", "SimdAluOp", signedTypes, 4, vnegCode)

    # VNEG, single-precision form: host float negation.
    vnegfpCode = '''
        destReg = -srcReg1;
    '''
    twoRegMiscInstFp("vneg", "NVnegDFp", "SimdFloatAluOp", ("float",), 2, vnegfpCode)
    twoRegMiscInstFp("vneg", "NVnegQFp", "SimdFloatAluOp", ("float",), 4, vnegfpCode)

    # VNEG, half-precision form: delegates to fplibNeg() on the raw
    # uint16_t encoding.
    vnegfpHCode = '''
        destElem = fplibNeg(srcElem1);
    '''
    twoRegMiscInst("vneg", "NVnegDFpH", "SimdFloatAluOp", ("uint16_t",),
            2, vnegfpHCode)
    twoRegMiscInst("vneg", "NVnegQFpH", "SimdFloatAluOp", ("uint16_t",),
            4, vnegfpHCode)

    # VCGT against zero, integer form: all element bits set when the
    # signed element is > 0, otherwise 0.
    vcgtCode = 'destElem = (srcElem1 > 0) ? mask(sizeof(Element) * 8) : 0;'
    twoRegMiscInst("vcgt", "NVcgtD", "SimdCmpOp", signedTypes, 2, vcgtCode)
    twoRegMiscInst("vcgt", "NVcgtQ", "SimdCmpOp", signedTypes, 4, vcgtCode)

    # VCGT against zero, single-precision form.  binaryOp() runs vcgtFunc
    # on (srcReg1, 0.0); the body treats a result of 0 as "condition
    # holds" (writes -1) and a result of 2.0 as the cue to set fpscr.ioc.
    # Registered with toInt = True.
    vcgtfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, (float)0.0, vcgtFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vcgt", "NVcgtDFp", "SimdFloatCmpOp", ("float",),
            2, vcgtfpCode, toInt = True)
    twoRegMiscInstFp("vcgt", "NVcgtQFp", "SimdFloatCmpOp", ("float",),
            4, vcgtfpCode, toInt = True)

    # VCGT against zero, half-precision form: fplibCompareGT(src, 0) on
    # the raw uint16_t encoding, producing 0xFFFF or 0x0000.
    vcgtfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGT(srcElem1, (Element)0, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegMiscInst("vcgt", "NVcgtDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vcgtfpHCode)
    twoRegMiscInst("vcgt", "NVcgtQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vcgtfpHCode)

    # VCGE against zero, integer form: all element bits set when the
    # signed element is >= 0, otherwise 0.
    vcgeCode = 'destElem = (srcElem1 >= 0) ? mask(sizeof(Element) * 8) : 0;'
    twoRegMiscInst("vcge", "NVcgeD", "SimdCmpOp", signedTypes, 2, vcgeCode)
    twoRegMiscInst("vcge", "NVcgeQ", "SimdCmpOp", signedTypes, 4, vcgeCode)

    # VCGE against zero, single-precision form.  binaryOp() runs vcgeFunc
    # on (srcReg1, 0.0); the body treats a result of 0 as "condition
    # holds" (writes -1) and a result of 2.0 as the cue to set fpscr.ioc.
    # Registered with toInt = True.
    vcgefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, (float)0.0, vcgeFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vcge", "NVcgeDFp", "SimdFloatCmpOp", ("float",),
            2, vcgefpCode, toInt = True)
    twoRegMiscInstFp("vcge", "NVcgeQFp", "SimdFloatCmpOp", ("float",),
            4, vcgefpCode, toInt = True)

    # VCGE against zero, half-precision form: fplibCompareGE(src, 0) on
    # the raw uint16_t encoding, producing 0xFFFF or 0x0000.
    vcgefpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGE(srcElem1, (Element)0, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegMiscInst("vcge", "NVcgeDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vcgefpHCode)
    twoRegMiscInst("vcge", "NVcgeQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vcgefpHCode)

    # VCEQ against zero, integer form: all element bits set when the
    # element equals 0, otherwise 0.
    vceqCode = 'destElem = (srcElem1 == 0) ? mask(sizeof(Element) * 8) : 0;'
    twoRegMiscInst("vceq", "NVceqD", "SimdCmpOp", signedTypes, 2, vceqCode)
    twoRegMiscInst("vceq", "NVceqQ", "SimdCmpOp", signedTypes, 4, vceqCode)

    # VCEQ against zero, single-precision form.  binaryOp() runs vceqFunc
    # on (srcReg1, 0.0); the body treats a result of 0 as "condition
    # holds" (writes -1) and a result of 2.0 as the cue to set fpscr.ioc.
    # Registered with toInt = True.
    vceqfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, (float)0.0, vceqFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vceq", "NVceqDFp", "SimdFloatCmpOp", ("float",),
            2, vceqfpCode, toInt = True)
    twoRegMiscInstFp("vceq", "NVceqQFp", "SimdFloatCmpOp", ("float",),
            4, vceqfpCode, toInt = True)

    # VCEQ against zero, half-precision form: fplibCompareEQ(src, 0) on
    # the raw uint16_t encoding, producing 0xFFFF or 0x0000.
    vceqfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareEQ(srcElem1, (Element)0, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegMiscInst("vceq", "NVceqDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vceqfpHCode)
    twoRegMiscInst("vceq", "NVceqQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vceqfpHCode)

    # VCLE against zero, integer form: all element bits set when the
    # signed element is <= 0, otherwise 0.
    vcleCode = 'destElem = (srcElem1 <= 0) ? mask(sizeof(Element) * 8) : 0;'
    twoRegMiscInst("vcle", "NVcleD", "SimdCmpOp", signedTypes, 2, vcleCode)
    twoRegMiscInst("vcle", "NVcleQ", "SimdCmpOp", signedTypes, 4, vcleCode)

    # VCLE against zero, single-precision form.  binaryOp() runs vcleFunc
    # on (srcReg1, 0.0); the body treats a result of 0 as "condition
    # holds" (writes -1) and a result of 2.0 as the cue to set fpscr.ioc.
    # Registered with toInt = True.
    vclefpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, (float)0.0, vcleFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vcle", "NVcleDFp", "SimdFloatCmpOp", ("float",),
            2, vclefpCode, toInt = True)
    twoRegMiscInstFp("vcle", "NVcleQFp", "SimdFloatCmpOp", ("float",),
            4, vclefpCode, toInt = True)

    # VCLE against zero, half-precision form: expressed as
    # fplibCompareGE(0, src), i.e. GE with the operands swapped, producing
    # 0xFFFF or 0x0000.
    vclefpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGE((Element)0, srcElem1, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegMiscInst("vcle", "NVcleDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vclefpHCode)
    twoRegMiscInst("vcle", "NVcleQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vclefpHCode)

    # VCLT against zero, integer form: all element bits set when the
    # signed element is < 0, otherwise 0.
    vcltCode = 'destElem = (srcElem1 < 0) ? mask(sizeof(Element) * 8) : 0;'
    twoRegMiscInst("vclt", "NVcltD", "SimdCmpOp", signedTypes, 2, vcltCode)
    twoRegMiscInst("vclt", "NVcltQ", "SimdCmpOp", signedTypes, 4, vcltCode)

    # VCLT against zero, single-precision form.  binaryOp() runs vcltFunc
    # on (srcReg1, 0.0); the body treats a result of 0 as "condition
    # holds" (writes -1) and a result of 2.0 as the cue to set fpscr.ioc.
    # Registered with toInt = True.
    vcltfpCode = '''
        FPSCR fpscr = (FPSCR) FpscrExc;
        float res = binaryOp(fpscr, srcReg1, (float)0.0, vcltFunc,
                             true, true, VfpRoundNearest);
        destReg = (res == 0) ? -1 : 0;
        if (res == 2.0)
            fpscr.ioc = 1;
        FpscrExc = fpscr;
    '''
    twoRegMiscInstFp("vclt", "NVcltDFp", "SimdFloatCmpOp", ("float",),
            2, vcltfpCode, toInt = True)
    twoRegMiscInstFp("vclt", "NVcltQFp", "SimdFloatCmpOp", ("float",),
            4, vcltfpCode, toInt = True)

    # VCLT against zero, half-precision form: expressed as
    # fplibCompareGT(0, src), i.e. GT with the operands swapped, producing
    # 0xFFFF or 0x0000.
    vcltfpHCode = '''
        FPSCR fpscr = fpVASimdFPSCRValue((FPSCR)FpscrExc);
        bool test_passed = fplibCompareGT((Element)0, srcElem1, fpscr);
        destElem = test_passed ? 0xFFFF : 0x0000;
        FpscrExc = fpRestoreFPSCRValue(FpscrExc, fpscr);
    '''
    twoRegMiscInst("vclt", "NVcltDFpH", "SimdFloatCmpOp", ("uint16_t",),
            2, vcltfpHCode)
    twoRegMiscInst("vclt", "NVcltQFpH", "SimdFloatCmpOp", ("uint16_t",),
            4, vcltfpHCode)

    # VSWP: exchanges srcReg1 and destReg one 32-bit regs[] word at a
    # time, for rCount words.  Both operands are modified.
    vswpCode = '''
        uint32_t mid;
        for (unsigned r = 0; r < rCount; r++) {
            mid = srcReg1.regs[r];
            srcReg1.regs[r] = destReg.regs[r];
            destReg.regs[r] = mid;
        }
    '''
    twoRegMiscScramble("vswp", "NVswpD", "SimdAluOp", ("uint64_t",), 2, vswpCode)
    twoRegMiscScramble("vswp", "NVswpQ", "SimdAluOp", ("uint64_t",), 4, vswpCode)

    # VTRN: for every even index i, swaps srcReg1.elements[i] with
    # destReg.elements[i + 1]; the other elements are left in place.
    vtrnCode = '''
        Element mid;
        for (unsigned i = 0; i < eCount; i += 2) {
            mid = srcReg1.elements[i];
            srcReg1.elements[i] = destReg.elements[i + 1];
            destReg.elements[i + 1] = mid;
        }
    '''
    twoRegMiscScramble("vtrn", "NVtrnD", "SimdAluOp",
            smallUnsignedTypes, 2, vtrnCode)
    twoRegMiscScramble("vtrn", "NVtrnQ", "SimdAluOp",
            smallUnsignedTypes, 4, vtrnCode)

    # VUZP: de-interleave.  mid holds a copy of the original srcReg1.
    # After the first loop srcReg1 contains the odd-indexed elements of
    # destReg followed by the odd-indexed elements of the original
    # srcReg1, and the low half of destReg contains destReg's own
    # even-indexed elements (compacted in place; element i is written only
    # after elements 2*i and 2*i+1 have been read).  The second loop fills
    # the high half of destReg with the even-indexed elements of the
    # original srcReg1.
    vuzpCode = '''
        Element mid[eCount];
        memcpy(&mid, &srcReg1, sizeof(srcReg1));
        for (unsigned i = 0; i < eCount / 2; i++) {
            srcReg1.elements[i] = destReg.elements[2 * i + 1];
            srcReg1.elements[eCount / 2 + i] = mid[2 * i + 1];
            destReg.elements[i] = destReg.elements[2 * i];
        }
        for (unsigned i = 0; i < eCount / 2; i++) {
            destReg.elements[eCount / 2 + i] = mid[2 * i];
        }
    '''
    twoRegMiscScramble("vuzp", "NVuzpD", "SimdAluOp", unsignedTypes, 2, vuzpCode)
    twoRegMiscScramble("vuzp", "NVuzpQ", "SimdAluOp", unsignedTypes, 4, vuzpCode)

    # VZIP: interleave.  mid holds a copy of the original destReg.
    # The first loop rebuilds destReg from the low halves of both operands
    # (original destReg element i, then srcReg1 element i).  The second
    # loop rebuilds srcReg1 in place from the high halves; this is safe
    # because iteration i reads srcReg1.elements[eCount / 2 + i], which
    # lies above every index written so far (at most 2 * i - 1).
    #
    # Both loops use an unsigned index, matching the other eCount loops in
    # this file (vtrn, vuzp, vext) and avoiding a mixed int/unsigned
    # comparison; the values visited are the same.
    # NOTE(review): assumes eCount is declared unsigned by the enclosing
    # execute template, as the sibling loops imply -- confirm.
    vzipCode = '''
        Element mid[eCount];
        memcpy(&mid, &destReg, sizeof(destReg));
        for (unsigned i = 0; i < eCount / 2; i++) {
            destReg.elements[2 * i] = mid[i];
            destReg.elements[2 * i + 1] = srcReg1.elements[i];
        }
        for (unsigned i = 0; i < eCount / 2; i++) {
            srcReg1.elements[2 * i] = mid[eCount / 2 + i];
            srcReg1.elements[2 * i + 1] = srcReg1.elements[eCount / 2 + i];
        }
    '''
    twoRegMiscScramble("vzip", "NVzipD", "SimdAluOp", unsignedTypes, 2, vzipCode)
    twoRegMiscScramble("vzip", "NVzipQ", "SimdAluOp", unsignedTypes, 4, vzipCode)

    # VMOVN: plain element assignment; any narrowing comes from the
    # element types chosen by twoRegNarrowMiscInst (defined outside this
    # chunk).
    vmovnCode = 'destElem = srcElem1;'
    twoRegNarrowMiscInst("vmovn", "NVmovn", "SimdMiscOp", smallUnsignedTypes, vmovnCode)

    # VDUP (scalar source): plain element assignment; the choice of source
    # element is made by twoRegMiscScInst (defined outside this chunk).
    vdupCode = 'destElem = srcElem1;'
    twoRegMiscScInst("vdup", "NVdupD", "SimdAluOp", smallUnsignedTypes, 2, vdupCode)
    twoRegMiscScInst("vdup", "NVdupQ", "SimdAluOp", smallUnsignedTypes, 4, vdupCode)

    # VMOVX: destination element 0 receives source element 1 (uint16_t
    # elements); every other destination element is zeroed.  The source is
    # read directly from srcRegs1, hence readSrcElem=False.
    vmovxCode = '''
        if (i == 0) {
            destElem = letoh(srcRegs1.elements[1]);
        } else {
            destElem = (Element)0;
        }
    '''
    twoRegShiftInst("vmovx", "NVmovx", "SimdMiscOp", ("uint16_t",), 1,
                    vmovxCode, readSrcElem=False)

    # VINS: destination element 1 receives source element 0; every other
    # destination element keeps its current value, which is why the
    # instruction is registered with readDest=True.
    vinsCode = '''
        if (i == 1) {
            destElem = letoh(srcRegs1.elements[0]);
        } else {
            destElem = letoh(destRegs.elements[i]);
        }
    '''
    twoRegShiftInst("vins", "NVins", "SimdMiscOp", ("uint16_t",), 1,
                    vinsCode, readDest=True, readSrcElem=False)

    def vdupGprInst(name, Name, opClass, types, rCount):
        # Emit a VDUP that broadcasts the general-purpose operand Op1 into
        # every element of the destination vector.
        #   name    - assembly mnemonic
        #   Name    - generated C++ class name
        #   opClass - op class recorded for the instruction
        #   types   - element types to instantiate the execute method for
        #   rCount  - number of 32-bit destination registers written
        global header_output, exec_output
        # One write-back statement per 32-bit destination register.
        writeBack = "".join('''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : regIdx } for regIdx in range(rCount))
        eWalkCode = simdEnabledCheckCode + '''
        RegVect destReg;
        for (unsigned i = 0; i < eCount; i++) {
            destReg.elements[i] = htole((Element)Op1);
        }
        ''' + writeBack
        instParams = ArmInstObjParams(
            name, Name, "RegRegOp",
            { "code": eWalkCode,
              "r_count": rCount,
              "predicate_test": predicateTest,
              "op_class": opClass },
            [])
        header_output += NeonRegRegOpDeclare.subst(instParams)
        exec_output += NeonEqualRegExecute.subst(instParams)
        # Explicit instantiation of the execute method per element type.
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                { "targs" : elemType, "class_name" : Name })
    vdupGprInst("vdup", "NVdupDGpr", "SimdMiscOp", smallUnsignedTypes, 2)
    vdupGprInst("vdup", "NVdupQGpr", "SimdMiscOp", smallUnsignedTypes, 4)

    # One-register immediate forms, all instantiated for uint64_t.
    # VMOV (immediate): destination element becomes imm.
    vmovCode = 'destElem = imm;'
    oneRegImmInst("vmov", "NVmoviD", "SimdMiscOp", ("uint64_t",), 2, vmovCode)
    oneRegImmInst("vmov", "NVmoviQ", "SimdMiscOp", ("uint64_t",), 4, vmovCode)

    # VORR (immediate): ORs imm into the existing destination value.  The
    # trailing True is passed only for the read-modify-write forms (VORR
    # and VBIC).
    # NOTE(review): True presumably asks the helper to read the
    # destination first -- confirm against oneRegImmInst.
    vorrCode = 'destElem |= imm;'
    oneRegImmInst("vorr", "NVorriD", "SimdAluOp", ("uint64_t",), 2, vorrCode, True)
    oneRegImmInst("vorr", "NVorriQ", "SimdAluOp", ("uint64_t",), 4, vorrCode, True)

    # VMVN (immediate): destination element becomes ~imm.  This re-binds
    # vmvnCode, which earlier in this file held the register-form body.
    vmvnCode = 'destElem = ~imm;'
    oneRegImmInst("vmvn", "NVmvniD", "SimdAluOp", ("uint64_t",), 2, vmvnCode)
    oneRegImmInst("vmvn", "NVmvniQ", "SimdAluOp", ("uint64_t",), 4, vmvnCode)

    # VBIC (immediate): clears the bits of imm in the existing destination
    # value.
    vbicCode = 'destElem &= ~imm;'
    oneRegImmInst("vbic", "NVbiciD", "SimdAluOp", ("uint64_t",), 2, vbicCode, True)
    oneRegImmInst("vbic", "NVbiciQ", "SimdAluOp", ("uint64_t",), 4, vbicCode, True)

    # VQMOVN (signed): assigns the wide source to the narrow destination
    # and checks the round trip through BigElement.  On a mismatch it sets
    # fpscr.qc and saturates: mask(bits - 1) for non-negative sources, the
    # bitwise complement of that for negative ones.
    vqmovnCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if ((BigElement)destElem != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8 - 1);
        if (srcElem1 < 0)
            destElem = ~destElem;
    }
    FpscrQc = fpscr;
    '''
    twoRegNarrowMiscInst("vqmovn", "NVqmovn", "SimdMiscOp", smallSignedTypes, vqmovnCode)

    # Unsigned-element narrowing (registered under the "vqmovun" mnemonic
    # as class NVqmovun): on a round-trip mismatch sets fpscr.qc and
    # saturates to all ones, mask(bits).
    vqmovunCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if ((BigElement)destElem != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8);
    }
    FpscrQc = fpscr;
    '''
    twoRegNarrowMiscInst("vqmovun", "NVqmovun",
            "SimdMiscOp", smallUnsignedTypes, vqmovunCode)

    # Signed-element variant registered under the same "vqmovun" mnemonic
    # (class NVqmovuns).  Saturation is triggered by a negative source or
    # by the masked round trip differing from the source: the result is
    # mask(bits), complemented when the source is negative.
    vqmovunsCode = '''
    FPSCR fpscr = (FPSCR) FpscrQc;
    destElem = srcElem1;
    if (srcElem1 < 0 ||
            ((BigElement)destElem & mask(sizeof(Element) * 8)) != srcElem1) {
        fpscr.qc = 1;
        destElem = mask(sizeof(Element) * 8);
        if (srcElem1 < 0)
            destElem = ~destElem;
    }
    FpscrQc = fpscr;
    '''
    twoRegNarrowMiscInst("vqmovun", "NVqmovuns",
            "SimdMiscOp", smallSignedTypes, vqmovunsCode)

    def buildVext(name, Name, opClass, types, rCount, op):
        # Emit a three-register-plus-immediate instruction whose body `op`
        # works on whole-vector views srcReg1, srcReg2 and destReg.
        #   name    - assembly mnemonic
        #   Name    - generated C++ class name
        #   opClass - op class recorded for the instruction
        #   types   - element types to instantiate the execute method for
        #   rCount  - number of 32-bit registers per operand
        #   op      - C++ snippet placed between operand load and write-back
        global header_output, exec_output
        # Load both sources, one 32-bit register at a time.
        loadSources = "".join('''
                srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
                srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
            ''' % { "reg" : regIdx } for regIdx in range(rCount))
        # Write the destination back, one 32-bit register at a time.
        storeDest = "".join('''
            FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
            ''' % { "reg" : regIdx } for regIdx in range(rCount))
        eWalkCode = simdEnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
        ''' + loadSources + op + storeDest
        instParams = ArmInstObjParams(
            name, Name, "RegRegRegImmOp",
            { "code": eWalkCode,
              "r_count": rCount,
              "predicate_test": predicateTest,
              "op_class": opClass },
            [])
        header_output += NeonRegRegRegImmOpDeclare.subst(instParams)
        exec_output += NeonEqualRegExecute.subst(instParams)
        # Explicit instantiation of the execute method per element type.
        for elemType in types:
            exec_output += NeonExecDeclare.subst(
                { "targs" : elemType, "class_name" : Name })

    # VEXT body: destination element i is taken from position (i + imm) of
    # the concatenation srcReg1:srcReg2.  Indices below eCount select from
    # srcReg1; indices in [eCount, 2 * eCount) select from srcReg2; anything
    # larger raises UndefinedInstruction.  Note that the loop keeps running
    # after the fault has been assigned.
    vextCode = '''
        for (unsigned i = 0; i < eCount; i++) {
            unsigned index = i + imm;
            if (index < eCount) {
                destReg.elements[i] = srcReg1.elements[index];
            } else {
                index -= eCount;
                if (index >= eCount) {
                    fault = std::make_shared<UndefinedInstruction>(machInst,
                                                                   false,
                                                                   mnemonic);
                } else {
                    destReg.elements[i] = srcReg2.elements[index];
                }
            }
        }
    '''
    # Byte-granular only: instantiated for uint8_t in D (2) and Q (4) form.
    buildVext("vext", "NVextD", "SimdMiscOp", ("uint8_t",), 2, vextCode)
    buildVext("vext", "NVextQ", "SimdMiscOp", ("uint8_t",), 4, vextCode)

    def buildVtbxl(name, Name, opClass, length, isVtbl):
        # Emit a byte table-lookup instruction (VTBL or VTBX).
        #   name    - assembly mnemonic
        #   Name    - generated C++ class name
        #   opClass - op class recorded for the instruction
        #   length  - table size in 8-byte units; 2 * length 32-bit table
        #             words are loaded from FpOp1P*, the rest are zeroed
        #   isVtbl  - C++ boolean literal ("true"/"false"); when true,
        #             out-of-range indices produce 0, otherwise the
        #             destination byte is left unchanged
        global header_output, decoder_output, exec_output
        # Declarations plus loading of the index vector and current dest.
        setupCode = '''
            union
            {
                uint8_t bytes[32];
                uint32_t regs[8];
            } table;

            union
            {
                uint8_t bytes[8];
                uint32_t regs[2];
            } destReg, srcReg2;

            const unsigned length = %(length)d;
            const bool isVtbl = %(isVtbl)s;

            srcReg2.regs[0] = htole(FpOp2P0_uw);
            srcReg2.regs[1] = htole(FpOp2P1_uw);

            destReg.regs[0] = htole(FpDestP0_uw);
            destReg.regs[1] = htole(FpDestP1_uw);
        ''' % { "length" : length, "isVtbl" : isVtbl }
        # All eight table words are initialised: from the operand while
        # inside the table, with zero beyond it.
        tableInit = "".join(
            ('table.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);\n'
             if regIdx < length * 2 else
             'table.regs[%(reg)d] = 0;\n') % { "reg" : regIdx }
            for regIdx in range(8))
        lookupCode = '''
        for (unsigned i = 0; i < sizeof(destReg); i++) {
            uint8_t index = srcReg2.bytes[i];
            if (index < 8 * length) {
                destReg.bytes[i] = table.bytes[index];
            } else {
                if (isVtbl)
                    destReg.bytes[i] = 0;
                // else destReg.bytes[i] unchanged
            }
        }

        FpDestP0_uw = letoh(destReg.regs[0]);
        FpDestP1_uw = letoh(destReg.regs[1]);
        '''
        code = simdEnabledCheckCode + setupCode + tableInit + lookupCode
        instParams = ArmInstObjParams(
            name, Name, "RegRegRegOp",
            { "code": code,
              "predicate_test": predicateTest,
              "op_class": opClass },
            [])
        header_output += RegRegRegOpDeclare.subst(instParams)
        decoder_output += RegRegRegOpConstructor.subst(instParams)
        exec_output += PredOpExecute.subst(instParams)

    # Register VTBL (isVtbl "true") and then VTBX (isVtbl "false"), each for
    # table lengths 1 through 4, as classes NVtbl1..NVtbl4 and
    # NVtbx1..NVtbx4.
    for tblMnem, tblStem, tblIsVtbl in (("vtbl", "NVtbl", "true"),
                                        ("vtbx", "NVtbx", "false")):
        for tblLength in range(1, 5):
            buildVtbxl(tblMnem, "%s%d" % (tblStem, tblLength),
                       "SimdMiscOp", tblLength, tblIsVtbl)
}};
